Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions api/v1/gpu_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,19 @@ type RunningAppDetail struct {

// Worker count
Count int `json:"count"`

// Pods that are running this workload, each with its identity, GPU requests/limits and QoS
// +optional
Pods []*PodGPUInfo `json:"pods,omitempty"`
}

// PodGPUInfo identifies a single pod and records the GPU resources it asks
// for. It is the element type of RunningAppDetail.Pods and of the slices
// stored in GPUNodeStatus.AllocatedPods.
//
// NOTE(review): Resource appears to be a struct (the generated CRD renders
// requests/limits as objects with required tflops and vram). encoding/json
// never treats a struct value as empty, so omitempty on Requests and Limits
// presumably has no effect on the marshalled output — confirm this is intended.
type PodGPUInfo struct {
// Name is the pod's name.
Name string `json:"name,omitempty"`
// Namespace is the namespace the pod belongs to.
Namespace string `json:"namespace,omitempty"`
// UID is the pod's unique identifier (presumably the Kubernetes metadata UID; verify against the writer).
UID string `json:"uid,omitempty"`
// Requests is the GPU resource request of the pod (tflops, vram, and optionally compute in the CRD schema).
Requests Resource `json:"requests,omitempty"`
// Limits is the GPU resource limit of the pod, expressed in the same Resource shape as Requests.
Limits Resource `json:"limits,omitempty"`
// QoS is the pod's quality-of-service level; the generated CRD restricts it to low, medium, high or critical.
QoS QoSLevel `json:"qos,omitempty"`
}

// +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating
Expand Down
2 changes: 1 addition & 1 deletion api/v1/gpunode_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in
TotalTFlops: initTFlops,
TotalVRAM: initVRAM,
TotalGPUs: initGPUs,
AllocationInfo: []*RunningAppDetail{},
AllocatedPods: make(map[string][]*PodGPUInfo),
LoadedModels: &[]string{},
ManagedGPUDeviceIDs: []string{},
ObservedGeneration: node.Generation,
Expand Down
5 changes: 4 additions & 1 deletion api/v1/gpunode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,10 @@ type GPUNodeStatus struct {
ObservedGeneration int64 `json:"observedGeneration,omitempty"`

// +optional
AllocationInfo []*RunningAppDetail `json:"allocationInfo,omitempty"`
TotalGPUPods int32 `json:"totalGPUPods,omitempty"`

// +optional
AllocatedPods map[string][]*PodGPUInfo `json:"allocatedPods,omitempty"`
}

// +kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying
Expand Down
57 changes: 48 additions & 9 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

90 changes: 75 additions & 15 deletions charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,21 +86,78 @@ spec:
status:
description: GPUNodeStatus defines the observed state of GPUNode.
properties:
allocationInfo:
items:
properties:
count:
description: Worker count
type: integer
name:
description: Workload name namespace
type: string
namespace:
type: string
required:
- count
type: object
type: array
allocatedPods:
additionalProperties:
items:
properties:
limits:
properties:
compute:
anyOf:
- type: integer
- type: string
description: 0-100 percentage, mutually exclusive with
TFLOPs
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
tflops:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
vram:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
required:
- tflops
- vram
type: object
name:
type: string
namespace:
type: string
qos:
enum:
- low
- medium
- high
- critical
type: string
requests:
properties:
compute:
anyOf:
- type: integer
- type: string
description: 0-100 percentage, mutually exclusive with
TFLOPs
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
tflops:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
vram:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
required:
- tflops
- vram
type: object
uid:
type: string
type: object
type: array
type: object
availableTFlops:
anyOf:
- type: integer
Expand Down Expand Up @@ -221,6 +278,9 @@ spec:
- Unknown
- Destroying
type: string
totalGPUPods:
format: int32
type: integer
totalGPUs:
format: int32
type: integer
Expand Down
71 changes: 71 additions & 0 deletions charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,77 @@ spec:
type: string
namespace:
type: string
pods:
description: Pod names that are running this workload
items:
properties:
limits:
properties:
compute:
anyOf:
- type: integer
- type: string
description: 0-100 percentage, mutually exclusive
with TFLOPs
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
tflops:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
vram:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
required:
- tflops
- vram
type: object
name:
type: string
namespace:
type: string
qos:
enum:
- low
- medium
- high
- critical
type: string
requests:
properties:
compute:
anyOf:
- type: integer
- type: string
description: 0-100 percentage, mutually exclusive
with TFLOPs
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
tflops:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
vram:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
required:
- tflops
- vram
type: object
uid:
type: string
type: object
type: array
required:
- count
type: object
Expand Down
Loading
Loading