275 changes: 256 additions & 19 deletions config/samples/dynamic-config.yaml
@@ -1,23 +1,260 @@
metricsTTL: 30d

# default to 'influx', influx v2 line protocol
metricsFormat: json
metricsFormat: influx

alertRules:
- name: GPUTFlopsFull
query: |
SELECT
node,
pool,
uuid,
avg(compute_percentage) AS compute_used
FROM tf_gpu_usage
WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
GROUP BY node, pool, uuid
threshold: 97
evaluationInterval: 30s
consecutiveCount: 4
severity: P1
summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
alertTargetInstance: "{{ .uuid }}"
description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
alertRules:
# Worker TFlops throttled alert
- name: WorkerTFlopsThrottled
query: |
SELECT workload, worker, uuid, node, MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase
FROM tf_worker_usage
WHERE {{ .Conditions }}
GROUP BY workload, worker, uuid, node
HAVING throttled_increase > {{ .Threshold }}
threshold: 0
evaluationInterval: 15s
consecutiveCount: 3
severity: P1
summary: "Worker TFlops Throttled"
description: "Worker {{ .worker }} from Node {{ .node }} is using more than {{ .Threshold }}% of its TFlops limit"
alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"

# Worker VRAM switching too frequent alert
- name: WorkerVRAMSwitchCountIncreasing
query: |
SELECT workload, worker, uuid, node, MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase
FROM tf_worker_usage
WHERE {{ .Conditions }}
GROUP BY workload, worker, uuid, node
HAVING switch_increase > {{ .Threshold }}
threshold: 0
evaluationInterval: 2m
consecutiveCount: 1
severity: P1
summary: "Worker VRAM Switch Count Increasing"
description: "Worker {{ .worker }} from Node {{ .node }} has switched VRAM {{ .switch_increase }} times in last 2 minutes, GPU may be too hot"
alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"

# Worker can not scale up/scheduled alert
- name: WorkerAllocationFailed
query: |
SELECT pool, (MAX(total_allocation_fail_cnt) - MIN(total_allocation_fail_cnt)) as failure_increase
FROM tf_system_metrics
WHERE {{ .Conditions }}
GROUP BY pool
HAVING failure_increase > {{ .Threshold }}
threshold: 0
evaluationInterval: 30s
consecutiveCount: 1
severity: P1
summary: "Worker allocation failed for GPU Pool {{ .pool }}"
description: "Worker allocation failed, {{ .failure_increase }} times in last 30 seconds for GPU Pool {{ .pool }}"
alertTargetInstance: "{{ .pool }}"
runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"

# Single GPU Alerts

# GPU VRAM Full Alert
- name: GPUVRAMFull
query: |
SELECT
node,
pool,
uuid,
avg(memory_percentage) AS memory_used
FROM tf_gpu_usage
WHERE memory_percentage > {{ .Threshold }} AND {{ .Conditions }}
GROUP BY node, pool, uuid
threshold: 97
evaluationInterval: 30s
consecutiveCount: 2
severity: P1
summary: "GPU VRAM Full, used {{ .memory_used }}% on {{ .node }} {{ .uuid }}"
alertTargetInstance: "{{ .uuid }}"
description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s, average usage: {{ .memory_used }}%"

# GPU TFlops Full Alert
- name: GPUTFlopsFull
query: |
SELECT
node,
pool,
uuid,
avg(compute_percentage) AS compute_used
FROM tf_gpu_usage
WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
GROUP BY node, pool, uuid
threshold: 97
evaluationInterval: 30s
consecutiveCount: 4
severity: P1
summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
alertTargetInstance: "{{ .uuid }}"
description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"

# GPU Temperature alert
- name: GPUTemperatureHigh
query: |
SELECT
node,
pool,
uuid,
avg(temperature) AS avg_temperature
FROM tf_gpu_usage
WHERE temperature > {{ .Threshold }} AND {{ .Conditions }}
GROUP BY node, pool, uuid
threshold: 90
evaluationInterval: 30s
consecutiveCount: 3
severity: P1
summary: "GPU Temperature High, {{ .avg_temperature }}°C on {{ .node }} {{ .uuid }}"
alertTargetInstance: "{{ .uuid }}"
description: "GPU {{ .uuid }} from Node {{ .node }} has temperature above {{ .Threshold }}°C, Average temperature: {{ .avg_temperature }}, GPU Pool: {{ .pool }}"
runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"

# GPU Pool Alerts

# Node TFlops allocation alert
- name: NodeTFlopsAllocationCritical
query: |
SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING tflops_available < {{ .Threshold }}
threshold: 5
evaluationInterval: 1m
consecutiveCount: 2
severity: P0
summary: "Available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
alertTargetInstance: "{{ .node }}"

- name: NodeTFlopsAllocationWarning
query: |
SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING tflops_available < {{ .Threshold }}
threshold: 10
evaluationInterval: 1m
consecutiveCount: 2
severity: P1
summary: "Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
alertTargetInstance: "{{ .node }}"

# Pool TFlops allocation alert - Total
- name: PoolTotalTFlopsAllocationCritical
query: |
SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY pool
HAVING tflops_available < {{ .Threshold }}
threshold: 5
evaluationInterval: 1m
consecutiveCount: 2
severity: P0
summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%"
description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
alertTargetInstance: "{{ .pool }}"

- name: PoolTotalTFlopsAllocationWarning
query: |
SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY pool
HAVING tflops_available < {{ .Threshold }}
threshold: 10
evaluationInterval: 1m
consecutiveCount: 2
severity: P1
summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%"
description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
alertTargetInstance: "{{ .pool }}"

# Node VRAM allocation alert
- name: NodeVRAMAllocationCritical
query: |
SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING vram_available < {{ .Threshold }}
threshold: 5
evaluationInterval: 1m
consecutiveCount: 2
severity: P1
summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
alertTargetInstance: "{{ .node }}"

- name: NodeVRAMAllocationWarning
query: |
SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING vram_available < {{ .Threshold }}
threshold: 10
evaluationInterval: 1m
consecutiveCount: 2
severity: P1
summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
alertTargetInstance: "{{ .node }}"

# Pool VRAM allocation alert
- name: PoolVRAMAllocationWarning
query: |
SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY pool
HAVING vram_available < {{ .Threshold }}
threshold: 10
evaluationInterval: 1m
consecutiveCount: 2
severity: P1
summary: "Pool available VRAM below threshold, remaining {{ .vram_available }}% for {{ .pool }}"
description: "Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
alertTargetInstance: "{{ .pool }}"

# Empty or Idle GPU Alert
- name: EmptyGPU
query: |
SELECT DISTINCT node
FROM tf_node_metrics
WHERE {{ .Conditions }} AND node NOT IN (
SELECT DISTINCT node
FROM tf_worker_usage
WHERE {{ .Conditions }}
)
threshold: 0
evaluationInterval: 5m
consecutiveCount: 2
severity: P2
summary: "Empty GPU without any workload, Node {{ .node }}"
description: "GPU Node {{ .node }} has no workload running, should be decommissioned"
alertTargetInstance: "{{ .node }}"

- name: IdleGPU
query: |
SELECT node, pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) as vram
FROM tf_gpu_usage
WHERE {{ .Conditions }}
GROUP BY node, pool, uuid
HAVING compute < 1 and vram < {{ .Threshold }};
threshold: 5
evaluationInterval: 10m
consecutiveCount: 3
severity: P2
summary: "Idle GPU found: {{ .uuid }} on Node {{ .node }}"
description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has been idle for 3 consecutive 10m, compute: {{ .compute }}, vram: {{ .vram }}"
alertTargetInstance: "{{ .uuid }}"
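The {{ .Threshold }} and {{ .Conditions }} placeholders in these queries look like Go text/template actions, with .Threshold taken from each rule and .Conditions injected per evaluation. A minimal rendering sketch under that assumption — the alertQueryParams struct and the time-window predicate below are illustrative placeholders, not the operator's actual types:

```go
// Rendering sketch only: assumes the alert queries are Go text/templates.
// alertQueryParams and the Conditions predicate are hypothetical placeholders.
package main

import (
	"fmt"
	"os"
	"text/template"
)

type alertQueryParams struct {
	Threshold  float64
	Conditions string // e.g. a per-evaluation time-window predicate
}

const gpuTFlopsFullQuery = `SELECT node, pool, uuid, avg(compute_percentage) AS compute_used
FROM tf_gpu_usage
WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
GROUP BY node, pool, uuid`

func main() {
	tmpl, err := template.New("GPUTFlopsFull").Parse(gpuTFlopsFullQuery)
	if err != nil {
		fmt.Fprintln(os.Stderr, "parse failed:", err)
		return
	}
	params := alertQueryParams{
		Threshold:  97,
		Conditions: "ts >= now() - INTERVAL 30 SECOND", // assumed dialect, for illustration only
	}
	if err := tmpl.Execute(os.Stdout, params); err != nil {
		fmt.Fprintln(os.Stderr, "render failed:", err)
	}
}
```

Rendered this way, each rule becomes a plain SQL query that the evaluator can run every evaluationInterval and compare against consecutiveCount.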
14 changes: 12 additions & 2 deletions internal/cloudprovider/common/utils.go
@@ -131,6 +131,16 @@ func CalculateLeastCostGPUNodes(ctx context.Context, provider types.GPUNodeProvi

nodes := make([]tfv1.GPUNodeClaimSpec, 0, bestNumInstances)
for i := int64(0); i < bestNumInstances; i++ {

tflopsQuantity, err := resource.ParseQuantity(fmt.Sprintf("%f", bestInstance.FP16TFlopsPerGPU*float64(bestInstance.GPUCount)))
if err != nil {
return nil, fmt.Errorf("failed to parse GPUDeviceOffered: %v", err)
}

vramQuantity, err := resource.ParseQuantity(fmt.Sprintf("%dGi", bestInstance.VRAMGigabytesPerGPU*bestInstance.GPUCount))
if err != nil {
return nil, fmt.Errorf("failed to parse VRAMOffered: %v", err)
}
nodes = append(nodes, tfv1.GPUNodeClaimSpec{
NodeName: fmt.Sprintf("%s-%s", pool.Name, generateRandomString(8)),
InstanceType: bestInstance.InstanceType,
@@ -139,8 +149,8 @@ func CalculateLeastCostGPUNodes(ctx context.Context, provider types.GPUNodeProvi
Zone: zone,
CapacityType: preferredCapacityType,

TFlopsOffered: resource.MustParse(fmt.Sprintf("%f", bestInstance.FP16TFlopsPerGPU*float64(bestInstance.GPUCount))),
VRAMOffered: resource.MustParse(fmt.Sprintf("%dGi", bestInstance.VRAMGigabytesPerGPU*bestInstance.GPUCount)),
TFlopsOffered: tflopsQuantity,
VRAMOffered: vramQuantity,
GPUDeviceOffered: bestInstance.GPUCount,

ExtraParams: cluster.Spec.ComputingVendor.Params.ExtraParams,
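The hunk above (and the Karpenter change below) replaces resource.MustParse with resource.ParseQuantity, so a malformed quantity string surfaces as an error instead of panicking inside the node-provisioning path. A standalone sketch of the difference, using a NaN TFlops value as a hypothetical bad input:

```go
// Sketch only: shows why the guarded parse matters; not the project's code.
package main

import (
	"fmt"
	"math"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// A NaN value (e.g. from an unset TFlops field) formats as "NaN",
	// which resource.MustParse would panic on.
	bad := fmt.Sprintf("%f", math.NaN())
	if _, err := resource.ParseQuantity(bad); err != nil {
		fmt.Println("rejected without panicking:", err)
	}

	// The happy path is unchanged: 4 GPUs x 24 GiB each (illustrative numbers).
	vram, err := resource.ParseQuantity(fmt.Sprintf("%dGi", 4*24))
	if err != nil {
		panic(err)
	}
	fmt.Println("parsed VRAM quantity:", vram.String())
}
```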
6 changes: 5 additions & 1 deletion internal/cloudprovider/karpenter/nodeclaim.go
@@ -318,7 +318,11 @@ func (p KarpenterGPUNodeProvider) buildNodeClaim(ctx context.Context, param *tfv

// Add GPU resources if specified (Karpenter supports nvidia.com/gpu)
if param.GPUDeviceOffered > 0 {
resourceRequests[karpenterConfig.GPUResourceName] = resource.MustParse(fmt.Sprintf("%d", param.GPUDeviceOffered))
quantity, err := resource.ParseQuantity(fmt.Sprintf("%d", param.GPUDeviceOffered))
if err != nil {
return nil, fmt.Errorf("failed to parse GPUDeviceOffered: %v", err)
}
resourceRequests[karpenterConfig.GPUResourceName] = quantity
}

// query nodeClass and build NodeClassRef
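The same guard applied to the Karpenter claim builder: the GPU count is formatted and parsed before it lands in the resource requests. A minimal self-contained sketch, assuming the configured GPU resource name resolves to the usual nvidia.com/gpu:

```go
// Minimal sketch of the guarded pattern above, not the provider's real helper;
// the "nvidia.com/gpu" resource name is an assumption for illustration.
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func buildGPURequests(gpuCount int64) (corev1.ResourceList, error) {
	requests := corev1.ResourceList{}
	if gpuCount > 0 {
		q, err := resource.ParseQuantity(fmt.Sprintf("%d", gpuCount))
		if err != nil {
			// Unreachable for a plain integer, but keeps the claim builder panic-free.
			return nil, fmt.Errorf("failed to parse GPU count: %w", err)
		}
		requests[corev1.ResourceName("nvidia.com/gpu")] = q
	}
	return requests, nil
}

func main() {
	requests, err := buildGPURequests(4)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%v\n", requests)
}
```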
3 changes: 0 additions & 3 deletions internal/controller/tensorfusioncluster_controller.go
@@ -43,7 +43,6 @@ import (
"github.com/NexusGPU/tensor-fusion/internal/constants"
"github.com/NexusGPU/tensor-fusion/internal/metrics"
utils "github.com/NexusGPU/tensor-fusion/internal/utils"
corev1 "k8s.io/api/core/v1"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

Expand Down Expand Up @@ -382,7 +381,6 @@ func (r *TensorFusionClusterReconciler) checkTFClusterComponentsReady(ctx contex
constants.LabelKeyOwner: tfc.GetName(),
}))
if err != nil {
r.Recorder.Eventf(tfc, corev1.EventTypeWarning, "CheckComponentStatusError", err.Error())
return false, nil, fmt.Errorf("failed to list GPUPools: %w", err)
}
if len(pools.Items) != len(tfc.Spec.GPUPools) {
Expand Down Expand Up @@ -411,7 +409,6 @@ func (r *TensorFusionClusterReconciler) updateTFClusterStatus(ctx context.Contex
}
}
if err := r.Status().Update(ctx, tfc); err != nil {
r.Recorder.Eventf(tfc, corev1.EventTypeWarning, "UpdateClusterStatusError", err.Error())
return err
}
return nil
1 change: 0 additions & 1 deletion internal/controller/tensorfusionworkload_controller.go
@@ -347,7 +347,6 @@ func (r *TensorFusionWorkloadReconciler) updateStatus(
readyCondition.Status = metav1.ConditionFalse
readyCondition.Reason = "WorkerFailed"
readyCondition.Message = fmt.Sprintf("Failed workers num: %d", failedWorkers)
r.Recorder.Eventf(workload, corev1.EventTypeWarning, "WorkerFailed", "Failed workers num: %d", failedWorkers)
} else if workload.Spec.IsDynamicReplica() {
// for dynamic replicas, if no worker failed, indicate workload is running
phase = tfv1.TensorFusionWorkloadPhaseRunning
8 changes: 7 additions & 1 deletion internal/metrics/encoders/influx.go
@@ -4,6 +4,7 @@ import (
"time"

metricsProto "github.com/influxdata/line-protocol/v2/lineprotocol"
"k8s.io/klog/v2"
)

// InfluxStrategy implements InfluxDB line protocol encoding
@@ -28,7 +29,12 @@ func (s *InfluxStrategy) AddTag(key, value string) {
}

func (s *InfluxStrategy) AddField(key string, value any) {
s.enc.AddField(key, metricsProto.MustNewValue(value))
v, parsed := metricsProto.NewValue(value)
if !parsed {
klog.Error("metrics influx encoder failed to parse value: ", key, value)
return
}
s.enc.AddField(key, v)
}

func (s *InfluxStrategy) EndLine(timestamp time.Time) {
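On the encoder side, lineprotocol.NewValue returns a (Value, ok) pair instead of panicking the way MustNewValue does, so a field whose Go type has no line-protocol representation is now logged and skipped. A standalone sketch of that non-panicking path, with made-up measurement and field names:

```go
// Sketch only: demonstrates skipping an unencodable field instead of panicking.
package main

import (
	"fmt"
	"time"

	"github.com/influxdata/line-protocol/v2/lineprotocol"
)

func main() {
	var enc lineprotocol.Encoder
	enc.StartLine("tf_gpu_usage")
	enc.AddTag("uuid", "GPU-0")

	fields := map[string]any{
		"compute_percentage": 97.5,       // float64: encodable
		"unsupported":        struct{}{}, // no line-protocol representation
	}
	for key, raw := range fields {
		v, ok := lineprotocol.NewValue(raw)
		if !ok {
			// Previously MustNewValue would have panicked here.
			fmt.Println("skipping field with unsupported value type:", key)
			continue
		}
		enc.AddField(key, v)
	}

	enc.EndLine(time.Now())
	fmt.Print(string(enc.Bytes()))
}
```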