diff --git a/cmd/dcgm-exporter/main.go b/cmd/dcgm-exporter/main.go index 1c028f3c..aa6c7644 100644 --- a/cmd/dcgm-exporter/main.go +++ b/cmd/dcgm-exporter/main.go @@ -68,10 +68,13 @@ func main() { {{.GPUKey}}[:id1[,-id2...] or {{.GPUInstanceKey}}[:id1[,-id2...]. If an id list is used, then devices with match IDs must exist on the system. For example: - (default) = monitor all GPU instances in MIG mode, all GPUs if MIG mode is disabled. + (default) = monitor all GPU instances in MIG mode, all GPUs if MIG mode is disabled. (See {{.FlexKey}}) {{.GPUKey}} = Monitor all GPUs {{.GPUInstanceKey}} = Monitor all GPU instances - {{.FlexKey}} = Monitor all GPUs if MIG is disabled, or all GPU instances if MIG is enabled + {{.FlexKey}} = Monitor all GPUs if MIG is disabled, or all GPU instances if MIG is enabled. + Note: this rule will be applied to each GPU. If it has GPU instances, those + will be monitored. If it doesn't, then the GPU will be monitored. + This is our recommended option for single or mixed MIG Strategies. {{.GPUKey}}:0,1 = monitor GPUs 0 and 1 {{.GPUInstanceKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4. diff --git a/pkg/dcgmexporter/system_info.go b/pkg/dcgmexporter/system_info.go index 4d632895..81fcfdb2 100644 --- a/pkg/dcgmexporter/system_info.go +++ b/pkg/dcgmexporter/system_info.go @@ -38,13 +38,13 @@ type GpuInstanceInfo struct { type GpuInfo struct { DeviceInfo dcgm.Device GpuInstances []GpuInstanceInfo + MigEnabled bool } type SystemInfo struct { - GpuCount uint - Gpus [dcgm.MAX_NUM_DEVICES]GpuInfo - MigEnabled bool - dOpt DeviceOptions + GpuCount uint + Gpus [dcgm.MAX_NUM_DEVICES]GpuInfo + dOpt DeviceOptions } type MonitoringInfo struct { @@ -156,6 +156,8 @@ func InitializeSystemInfo(dOpt DeviceOptions, useFakeGpus bool) (SystemInfo, err sysInfo.GpuCount = gpuCount for i := uint(0); i < sysInfo.GpuCount; i++ { + // Default mig enabled to false + sysInfo.Gpus[i].MigEnabled = false sysInfo.Gpus[i].DeviceInfo, err = dcgm.GetDeviceInfo(i) if err != nil { if useFakeGpus { @@ -172,11 +174,7 @@ func InitializeSystemInfo(dOpt DeviceOptions, useFakeGpus bool) (SystemInfo, err return sysInfo, err } - if hierarchy.Count == 0 { - sysInfo.MigEnabled = false - } else { - sysInfo.MigEnabled = true - + if hierarchy.Count > 0 { var entities []dcgm.GroupEntityPair gpuId := uint(0) @@ -191,6 +189,7 @@ func InitializeSystemInfo(dOpt DeviceOptions, useFakeGpus bool) (SystemInfo, err ProfileName: "", EntityId: entityId, } + sysInfo.Gpus[gpuId].MigEnabled = true sysInfo.Gpus[gpuId].GpuInstances = append(sysInfo.Gpus[gpuId].GpuInstances, instanceInfo) entities = append(entities, dcgm.GroupEntityPair{dcgm.FE_GPU_I, entityId}) instanceIndex = len(sysInfo.Gpus[gpuId].GpuInstances) - 1 @@ -246,17 +245,26 @@ func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo { return monitoring } -func AddAllGpuInstances(sysInfo SystemInfo) []MonitoringInfo { +func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo { var monitoring []MonitoringInfo for i := uint(0); i < sysInfo.GpuCount; i++ { - for j := 0; j < len(sysInfo.Gpus[i].GpuInstances); j++ { + if addFlexibly == true && len(sysInfo.Gpus[i].GpuInstances) == 0 { mi := MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.Gpus[i].GpuInstances[j].EntityId}, + dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU}, sysInfo.Gpus[i].DeviceInfo, - &sysInfo.Gpus[i].GpuInstances[j], + nil, } monitoring = append(monitoring, mi) + } else { + for j := 0; j < len(sysInfo.Gpus[i].GpuInstances); j++ { + mi := MonitoringInfo{ + dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.Gpus[i].GpuInstances[j].EntityId}, + sysInfo.Gpus[i].DeviceInfo, + &sysInfo.Gpus[i].GpuInstances[j], + } + monitoring = append(monitoring, mi) + } } } @@ -297,11 +305,7 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo { var monitoring []MonitoringInfo if sysInfo.dOpt.Flex == true { - if sysInfo.MigEnabled == true { - return AddAllGpuInstances(sysInfo) - } else { - return AddAllGpus(sysInfo) - } + return AddAllGpuInstances(sysInfo, true) } else { if len(sysInfo.dOpt.GpuRange) > 0 && sysInfo.dOpt.GpuRange[0] == -1 { return AddAllGpus(sysInfo) @@ -313,7 +317,7 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo { } if len(sysInfo.dOpt.GpuInstanceRange) > 0 && sysInfo.dOpt.GpuInstanceRange[0] == -1 { - return AddAllGpuInstances(sysInfo) + return AddAllGpuInstances(sysInfo, false) } else { for _, gpuInstanceId := range sysInfo.dOpt.GpuInstanceRange { // We've already verified that everything in the options list exists diff --git a/pkg/dcgmexporter/system_info_test.go b/pkg/dcgmexporter/system_info_test.go index 9d7d7bdd..2b679dca 100644 --- a/pkg/dcgmexporter/system_info_test.go +++ b/pkg/dcgmexporter/system_info_test.go @@ -30,7 +30,6 @@ const ( func SpoofSystemInfo() SystemInfo { var sysInfo SystemInfo sysInfo.GpuCount = 2 - sysInfo.MigEnabled = true sysInfo.Gpus[0].DeviceInfo.GPU = 0 gi := GpuInstanceInfo{ Info: dcgm.MigEntityInfo{"fake", 0, 0, 0, 0, 3}, @@ -74,7 +73,8 @@ func TestMonitoredEntities(t *testing.T) { require.Equal(t, instanceCount, 2, "Expected 2 GPU instances but found %d", instanceCount) require.Equal(t, gpuCount, 0, "Expected 0 GPUs but found %d", gpuCount) - sysInfo.MigEnabled = false // we are now monitoring the GPUs + sysInfo.Gpus[0].GpuInstances = sysInfo.Gpus[0].GpuInstances[:0] + sysInfo.Gpus[1].GpuInstances = sysInfo.Gpus[1].GpuInstances[:0] monitoring = GetMonitoredEntities(sysInfo) require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring))) for i, mi := range monitoring {