Skip to content

Commit

Permalink
DCGM-2405. Report on GPUs and GPU instances if MIG and non-MIG GPUs a…
Browse files Browse the repository at this point in the history
…re present.

Signed-off-by: David Beer <dbeer@nvidia.com>
  • Loading branch information
dbeer committed Oct 4, 2021
1 parent ff14e23 commit ecc3f9f
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 23 deletions.
7 changes: 5 additions & 2 deletions cmd/dcgm-exporter/main.go
Expand Up @@ -68,10 +68,13 @@ func main() {
{{.GPUKey}}[:id1[,-id2...] or
{{.GPUInstanceKey}}[:id1[,-id2...].
If an id list is used, then devices with match IDs must exist on the system. For example:
(default) = monitor all GPU instances in MIG mode, all GPUs if MIG mode is disabled.
(default) = monitor all GPU instances in MIG mode, all GPUs if MIG mode is disabled. (See {{.FlexKey}})
{{.GPUKey}} = Monitor all GPUs
{{.GPUInstanceKey}} = Monitor all GPU instances
{{.FlexKey}} = Monitor all GPUs if MIG is disabled, or all GPU instances if MIG is enabled
{{.FlexKey}} = Monitor all GPUs if MIG is disabled, or all GPU instances if MIG is enabled.
Note: this rule will be applied to each GPU. If it has GPU instances, those
will be monitored. If it doesn't, then the GPU will be monitored.
This is our recommended option for single or mixed MIG Strategies.
{{.GPUKey}}:0,1 = monitor GPUs 0 and 1
{{.GPUInstanceKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4.
Expand Down
42 changes: 23 additions & 19 deletions pkg/dcgmexporter/system_info.go
Expand Up @@ -38,13 +38,13 @@ type GpuInstanceInfo struct {
// GpuInfo holds what InitializeSystemInfo records about a single GPU.
type GpuInfo struct {
// DeviceInfo is the device record obtained from dcgm.GetDeviceInfo for this GPU.
DeviceInfo dcgm.Device
// GpuInstances collects the GPU instances appended for this GPU during
// initialization; it stays empty when no instances are found for it.
GpuInstances []GpuInstanceInfo
// MigEnabled is tracked per GPU: it is defaulted to false and set to true
// once a GPU instance is recorded for this GPU.
MigEnabled bool
}

type SystemInfo struct {
GpuCount uint
Gpus [dcgm.MAX_NUM_DEVICES]GpuInfo
MigEnabled bool
dOpt DeviceOptions
GpuCount uint
Gpus [dcgm.MAX_NUM_DEVICES]GpuInfo
dOpt DeviceOptions
}

type MonitoringInfo struct {
Expand Down Expand Up @@ -156,6 +156,8 @@ func InitializeSystemInfo(dOpt DeviceOptions, useFakeGpus bool) (SystemInfo, err
sysInfo.GpuCount = gpuCount

for i := uint(0); i < sysInfo.GpuCount; i++ {
// Default MigEnabled to false; it is set to true below once a GPU instance is found for this GPU
sysInfo.Gpus[i].MigEnabled = false
sysInfo.Gpus[i].DeviceInfo, err = dcgm.GetDeviceInfo(i)
if err != nil {
if useFakeGpus {
Expand All @@ -172,11 +174,7 @@ func InitializeSystemInfo(dOpt DeviceOptions, useFakeGpus bool) (SystemInfo, err
return sysInfo, err
}

if hierarchy.Count == 0 {
sysInfo.MigEnabled = false
} else {
sysInfo.MigEnabled = true

if hierarchy.Count > 0 {
var entities []dcgm.GroupEntityPair

gpuId := uint(0)
Expand All @@ -191,6 +189,7 @@ func InitializeSystemInfo(dOpt DeviceOptions, useFakeGpus bool) (SystemInfo, err
ProfileName: "",
EntityId: entityId,
}
sysInfo.Gpus[gpuId].MigEnabled = true
sysInfo.Gpus[gpuId].GpuInstances = append(sysInfo.Gpus[gpuId].GpuInstances, instanceInfo)
entities = append(entities, dcgm.GroupEntityPair{dcgm.FE_GPU_I, entityId})
instanceIndex = len(sysInfo.Gpus[gpuId].GpuInstances) - 1
Expand Down Expand Up @@ -246,17 +245,26 @@ func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo {
return monitoring
}

func AddAllGpuInstances(sysInfo SystemInfo) []MonitoringInfo {
func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
var monitoring []MonitoringInfo

for i := uint(0); i < sysInfo.GpuCount; i++ {
for j := 0; j < len(sysInfo.Gpus[i].GpuInstances); j++ {
if addFlexibly == true && len(sysInfo.Gpus[i].GpuInstances) == 0 {
mi := MonitoringInfo{
dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.Gpus[i].GpuInstances[j].EntityId},
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU},
sysInfo.Gpus[i].DeviceInfo,
&sysInfo.Gpus[i].GpuInstances[j],
nil,
}
monitoring = append(monitoring, mi)
} else {
for j := 0; j < len(sysInfo.Gpus[i].GpuInstances); j++ {
mi := MonitoringInfo{
dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.Gpus[i].GpuInstances[j].EntityId},
sysInfo.Gpus[i].DeviceInfo,
&sysInfo.Gpus[i].GpuInstances[j],
}
monitoring = append(monitoring, mi)
}
}
}

Expand Down Expand Up @@ -297,11 +305,7 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo {
var monitoring []MonitoringInfo

if sysInfo.dOpt.Flex == true {
if sysInfo.MigEnabled == true {
return AddAllGpuInstances(sysInfo)
} else {
return AddAllGpus(sysInfo)
}
return AddAllGpuInstances(sysInfo, true)
} else {
if len(sysInfo.dOpt.GpuRange) > 0 && sysInfo.dOpt.GpuRange[0] == -1 {
return AddAllGpus(sysInfo)
Expand All @@ -313,7 +317,7 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo {
}

if len(sysInfo.dOpt.GpuInstanceRange) > 0 && sysInfo.dOpt.GpuInstanceRange[0] == -1 {
return AddAllGpuInstances(sysInfo)
return AddAllGpuInstances(sysInfo, false)
} else {
for _, gpuInstanceId := range sysInfo.dOpt.GpuInstanceRange {
// We've already verified that everything in the options list exists
Expand Down
4 changes: 2 additions & 2 deletions pkg/dcgmexporter/system_info_test.go
Expand Up @@ -30,7 +30,6 @@ const (
func SpoofSystemInfo() SystemInfo {
var sysInfo SystemInfo
sysInfo.GpuCount = 2
sysInfo.MigEnabled = true
sysInfo.Gpus[0].DeviceInfo.GPU = 0
gi := GpuInstanceInfo{
Info: dcgm.MigEntityInfo{"fake", 0, 0, 0, 0, 3},
Expand Down Expand Up @@ -74,7 +73,8 @@ func TestMonitoredEntities(t *testing.T) {
require.Equal(t, instanceCount, 2, "Expected 2 GPU instances but found %d", instanceCount)
require.Equal(t, gpuCount, 0, "Expected 0 GPUs but found %d", gpuCount)

sysInfo.MigEnabled = false // we are now monitoring the GPUs
sysInfo.Gpus[0].GpuInstances = sysInfo.Gpus[0].GpuInstances[:0]
sysInfo.Gpus[1].GpuInstances = sysInfo.Gpus[1].GpuInstances[:0]
monitoring = GetMonitoredEntities(sysInfo)
require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring)))
for i, mi := range monitoring {
Expand Down

0 comments on commit ecc3f9f

Please sign in to comment.