Skip to content
This repository has been archived by the owner on Nov 2, 2021. It is now read-only.

Commit

Permalink
Merge branch 'dbeer/dcgm2103' into 'master'
Browse files Browse the repository at this point in the history
DCGM-2103. Add configuration parameters to control what devices are monitored.

See merge request nvidia/container-toolkit/gpu-monitoring-tools!60
  • Loading branch information
dbeer committed Apr 12, 2021
2 parents ca22be5 + f65e418 commit c80add0
Show file tree
Hide file tree
Showing 14 changed files with 833 additions and 111 deletions.
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@ DOCKER ?= docker
MKDIR ?= mkdir
REGISTRY ?= nvidia

DCGM_VERSION := 2.1.4
DCGM_VERSION := 2.1.8
GOLANG_VERSION := 1.14.2
VERSION := 2.3.1
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)

NON_TEST_FILES := pkg/dcgm.go pkg/gpu_collector.go pkg/parser.go pkg/pipeline.go pkg/server.go pkg/system_info.go pkg/types.go pkg/utils.go pkg/kubernetes.go pkg/main.go
MAIN_TEST_FILES := pkg/system_info_test.go

.PHONY: all binary install check-format
all: ubuntu18.04 ubuntu20.04 ubi8

Expand Down Expand Up @@ -52,6 +55,9 @@ push-latest:
$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest"
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest"

test-main: $(NON_TEST_FILES) $(MAIN_TEST_FILES)
go test pkg/system_info_test.go pkg/system_info.go pkg/types.go

ubuntu20.04:
$(DOCKER) build --pull \
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
Expand All @@ -73,3 +79,4 @@ ubi8:
--build-arg "VERSION=$(FULL_VERSION)" \
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" \
--file docker/Dockerfile.ubi8 .

1 change: 1 addition & 0 deletions bindings/go/dcgm/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ type FieldValue_v2 struct {
Status int
Ts int64
Value [4096]byte
StringValue *string
}

const (
Expand Down
86 changes: 75 additions & 11 deletions bindings/go/dcgm/fields.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package dcgm
import "C"
import (
"fmt"
"unicode"
"unsafe"
)

Expand All @@ -16,6 +17,16 @@ const (
maxKeepSamples = 0 // nolimit
)

// FieldMeta is the Go-side copy of a DCGM field metadata record. ToFieldMeta
// fills it member by member from a C.dcgm_field_meta_p.
type FieldMeta struct {
// FieldId is converted from the C struct's fieldId.
FieldId Short
// FieldType is converted from the C struct's fieldType.
FieldType byte
// Size is byte(size) of the C struct's size member, so a C value that does
// not fit in one byte is truncated by the conversion.
Size byte
// Tag is the C struct's tag buffer converted to a Go string via stringPtr.
Tag string
// Scope is converted from the C struct's scope.
Scope int
// NvmlFieldId is converted from the C struct's nvmlFieldId.
NvmlFieldId int
// EntityLevel is converted from the C struct's entityLevel.
EntityLevel Field_Entity_Group
}

// FieldHandle wraps the C dcgmFieldGrp_t handle; FieldGroupCreate returns one
// as its fieldsId result.
type FieldHandle struct{ handle C.dcgmFieldGrp_t }

func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error) {
Expand Down Expand Up @@ -103,7 +114,7 @@ func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields
}

func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error) {
values := make([]C.dcgmFieldValue_v2, len(fields))
values := make([]C.dcgmFieldValue_v2, len(fields)*len(entities))
cfields := (*C.ushort)(unsafe.Pointer(&fields[0]))
cEntities := make([]C.dcgmGroupEntityPair_t, len(entities))
cPtrEntities := *(*[]C.dcgmGroupEntityPair_t)(unsafe.Pointer(&cEntities))
Expand Down Expand Up @@ -161,15 +172,30 @@ func (fv FieldValue_v1) Blob() [4096]byte {
func toFieldValue_v2(cfields []C.dcgmFieldValue_v2) []FieldValue_v2 {
fields := make([]FieldValue_v2, len(cfields))
for i, f := range cfields {
fields[i] = FieldValue_v2{
Version: uint(f.version),
EntityGroupId: Field_Entity_Group(f.entityGroupId),
EntityId: uint(f.entityId),
FieldId: uint(f.fieldId),
FieldType: uint(f.fieldType),
Status: int(f.status),
Ts: int64(f.ts),
Value: f.value,
if uint(f.fieldType) == DCGM_FT_STRING {
fields[i] = FieldValue_v2{
Version: uint(f.version),
EntityGroupId: Field_Entity_Group(f.entityGroupId),
EntityId: uint(f.entityId),
FieldId: uint(f.fieldId),
FieldType: uint(f.fieldType),
Status: int(f.status),
Ts: int64(f.ts),
Value: f.value,
StringValue: stringPtr((*C.char)(unsafe.Pointer(&f.value[0]))),
}
} else {
fields[i] = FieldValue_v2{
Version: uint(f.version),
EntityGroupId: Field_Entity_Group(f.entityGroupId),
EntityId: uint(f.entityId),
FieldId: uint(f.fieldId),
FieldType: uint(f.fieldType),
Status: int(f.status),
Ts: int64(f.ts),
Value: f.value,
StringValue: nil,
}
}
}

Expand All @@ -184,10 +210,48 @@ func Fv2_Float64(fv FieldValue_v2) float64 {
return *(*float64)(unsafe.Pointer(&fv.Value[0]))
}

// FindFirstNonAsciiIndex scans value from index 0 and returns the position of
// the first byte that falls outside the range 33..unicode.MaxASCII. Despite
// the name, that means it also stops at ASCII control characters and at the
// space character (anything below 33), not only at bytes above 127.
//
// When every byte lies inside the range, it returns 4096, the length of the
// buffer.
func FindFirstNonAsciiIndex(value [4096]byte) int {
	pos := 0
	for pos < len(value) {
		b := value[pos]
		if b < 33 || b > unicode.MaxASCII {
			break
		}
		pos++
	}

	// pos is either the offending index or len(value) (4096) if none was found.
	return pos
}

// Fv2_String returns the value carried by fv as a Go string.
//
// NOTE(review): this listing appears to be a rendered diff whose +/- markers
// were lost: the first return (which reinterprets the value bytes as a string
// header) looks like the removed line and the if/else below it like its
// replacement — confirm against the repository before relying on either.
//
// In the if/else form, a field whose FieldType is DCGM_FT_STRING returns
// *fv.StringValue, dereferenced without a nil check; any other field type
// returns the entire Value array converted to a string.
func Fv2_String(fv FieldValue_v2) string {
return *(*string)(unsafe.Pointer(&fv.Value[0]))
if fv.FieldType == DCGM_FT_STRING {
return *fv.StringValue
} else {
return string(fv.Value[:])
}
}

// Fv2_Blob returns the raw Value array of fv. Because the result is a
// [4096]byte array rather than a slice, the caller receives a copy.
func Fv2_Blob(fv FieldValue_v2) [4096]byte {
return fv.Value
}

// ToFieldMeta converts a C field metadata pointer into a Go FieldMeta by
// converting each member and turning the C tag buffer into a Go string via
// stringPtr.
//
// A nil fieldInfo yields the zero FieldMeta instead of dereferencing a null
// pointer. FieldGetById hands the result of C.DcgmFieldGetById straight to
// this function.
// NOTE(review): DCGM is understood to return a null pointer for field IDs it
// does not know — confirm against the DCGM API reference.
//
// Size is produced with byte(fieldInfo.size), so a C size that does not fit
// in one byte is truncated.
func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta {
	// Guard first: every member access below dereferences fieldInfo.
	if fieldInfo == nil {
		return FieldMeta{}
	}

	return FieldMeta{
		FieldId:     Short(fieldInfo.fieldId),
		FieldType:   byte(fieldInfo.fieldType),
		Size:        byte(fieldInfo.size),
		Tag:         *stringPtr((*C.char)(unsafe.Pointer(&fieldInfo.tag[0]))),
		Scope:       int(fieldInfo.scope),
		NvmlFieldId: int(fieldInfo.nvmlFieldId),
		EntityLevel: Field_Entity_Group(fieldInfo.entityLevel),
	}
}

// FieldGetById calls C.DcgmFieldGetById with fieldId converted to a C ushort
// and passes the returned pointer to ToFieldMeta.
// NOTE(review): the C result is not checked here before conversion — confirm
// what DcgmFieldGetById returns for an unknown field ID.
func FieldGetById(fieldId Short) FieldMeta {
return ToFieldMeta(C.DcgmFieldGetById(C.ushort(fieldId)))
}

// FieldsInit calls the C function DcgmFieldsInit and returns its result
// converted to int.
// NOTE(review): presumably this initializes DCGM's field metadata tables and
// 0 means success — confirm against the DCGM API reference.
func FieldsInit() int {
return int(C.DcgmFieldsInit())
}

// FieldsTerm calls the C function DcgmFieldsTerm and returns its result
// converted to int.
// NOTE(review): presumably the teardown counterpart of FieldsInit, with 0
// meaning success — confirm against the DCGM API reference.
func FieldsTerm() int {
return int(C.DcgmFieldsTerm())
}
2 changes: 0 additions & 2 deletions bindings/go/dcgm/mig.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ import (
"unsafe"
)

type MigHandle struct{ handle C.dcgmGpuGrp_t }

type Field_Entity_Group uint

const (
Expand Down
4 changes: 2 additions & 2 deletions deployment/dcgm-exporter/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]
# Use "-n" to remove the hostname tag from the output.
# Example arguments: ["-n"]
# Use "-d" to specify the devices to monitor. -d must be followed by a string
# in the following format: [f] or [g[:numeric_range][,]][i[:numeric_range]]
# in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]]
# where numeric_range is a range or comma-separated list of indices, e.g. 0-4 or 0,2,4.
# Example arguments: ["-d", "g,i"] to monitor all GPUs and GPU instances or
# Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances or
# ["-d", "g:0-3"] to monitor GPUs 0-3.

imagePullSecrets: []
Expand Down
16 changes: 14 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,23 @@ module github.com/NVIDIA/gpu-monitoring-tools
go 1.14

require (
github.com/Masterminds/semver v1.5.0 // indirect
github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18 // indirect
github.com/go-delve/delve v1.6.0 // indirect
github.com/gorilla/mux v1.7.4
github.com/sirupsen/logrus v1.4.2
github.com/mattn/go-colorable v0.1.8 // indirect
github.com/mattn/go-runewidth v0.0.10 // indirect
github.com/peterh/liner v1.2.1 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/russross/blackfriday v2.0.0+incompatible // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sirupsen/logrus v1.8.1
github.com/spf13/cobra v1.1.3 // indirect
github.com/stretchr/testify v1.5.1
github.com/urfave/cli/v2 v2.2.0
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 // indirect
go.starlark.net v0.0.0-20210312235212-74c10e2c17dc // indirect
golang.org/x/arch v0.0.0-20210324142154-d48d9c4a19f6 // indirect
golang.org/x/sys v0.0.0-20210324051608-47abb6519492 // indirect
google.golang.org/grpc v1.28.1
k8s.io/kubernetes v1.18.2
)
Expand Down

0 comments on commit c80add0

Please sign in to comment.