-
Notifications
You must be signed in to change notification settings - Fork 25
/
main.go
85 lines (76 loc) · 3.3 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
package main
import (
"flag"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"log"
"os"
"text/template"
"time"
)
// processInfo is the text/template source used to print the report for one
// entry of the process information. Fields wrapped in `or ... "N/A"` print
// "N/A" when the field evaluates as empty.
const processInfo = `----------------------------------------------------------------------
GPU ID : {{.GPU}}
----------Execution Stats---------------------------------------------
PID : {{.PID}}
Name : {{or .Name "N/A"}}
Start Time : {{.ProcessUtilization.StartTime.String}}
End Time : {{.ProcessUtilization.EndTime.String}}
----------Performance Stats-------------------------------------------
Energy Consumed (Joules) : {{or .ProcessUtilization.EnergyConsumed "N/A"}}
Max GPU Memory Used (bytes) : {{or .Memory.GlobalUsed "N/A"}}
Avg SM Clock (MHz) : {{or .Clocks.Cores "N/A"}}
Avg Memory Clock (MHz) : {{or .Clocks.Memory "N/A"}}
Avg SM Utilization (%) : {{or .GpuUtilization.GPU "N/A"}}
Avg Memory Utilization (%) : {{or .GpuUtilization.Memory "N/A"}}
Avg PCIe Rx Bandwidth (MB) : {{or .PCI.Throughput.Rx "N/A"}}
Avg PCIe Tx Bandwidth (MB) : {{or .PCI.Throughput.Tx "N/A"}}
----------Event Stats-------------------------------------------------
Single Bit ECC Errors : {{or .Memory.ECCErrors.SingleBit "N/A"}}
Double Bit ECC Errors : {{or .Memory.ECCErrors.DoubleBit "N/A"}}
Critical XID Errors : {{.XIDErrors.NumErrors}}
----------Slowdown Stats----------------------------------------------
Due to - Power (%) : {{or .Violations.Power "N/A"}}
- Thermal (%) : {{or .Violations.Thermal "N/A"}}
- Reliability (%) : {{or .Violations.Reliability "N/A"}}
- Board Limit (%) : {{or .Violations.BoardLimit "N/A"}}
- Low Utilization (%) : {{or .Violations.LowUtilization "N/A"}}
- Sync Boost (%) : {{or .Violations.SyncBoost "N/A"}}
----------Process Utilization-----------------------------------------
Avg SM Utilization (%) : {{or .ProcessUtilization.SmUtil "N/A"}}
Avg Memory Utilization (%) : {{or .ProcessUtilization.MemUtil "N/A"}}
----------------------------------------------------------------------
`
// process points at the value of the -pid command-line flag (default 0); it
// is only meaningful once flag.Parse has run.
var process = flag.Uint(
	"pid",
	0,
	"Provide pid to get this process information.",
)
// main prints the DCGM statistics recorded for the process selected with
// -pid, rendering the processInfo template once per entry returned by
// dcgm.GetProcessInfo.
//
// NOTE: "WatchPidFields()" must first be called (as root) BEFORE starting the
// process to be monitored:
//
//  1. Run as root, to enable health watches:
//     sudo dcgmi stats -e
//  2. Start the process to be monitored.
//  3. Run processInfo. This is equivalent to "dcgmi stats --pid ENTERPID -v":
//     go build && ./processInfo -pid PID
func main() {
	// Parse flags before touching DCGM. On -h or a malformed flag, flag.Parse
	// calls os.Exit, which skips deferred functions; doing it first means
	// there is no cleanup to skip and no start-up delay before usage is shown.
	flag.Parse()

	cleanup, err := dcgm.Init(dcgm.Embedded)
	if err != nil {
		log.Panicln(err)
	}
	// Later failures use log.Panicln rather than log.Fatalln: a panic still
	// runs this deferred cleanup, whereas os.Exit would not.
	defer cleanup()

	// Request DCGM to start recording stats for GPU process fields.
	group, err := dcgm.WatchPidFields()
	if err != nil {
		log.Panicln(err)
	}

	// Before retrieving process stats, wait a few seconds for the watches to
	// be enabled and to collect data.
	log.Println("Enabling DCGM watches to start collecting process stats. This may take a few seconds....")
	time.Sleep(3 * time.Second)

	pidInfo, err := dcgm.GetProcessInfo(group, *process)
	if err != nil {
		log.Panicln(err)
	}

	t := template.Must(template.New("Process").Parse(processInfo))
	for _, gpu := range pidInfo {
		if err = t.Execute(os.Stdout, gpu); err != nil {
			log.Panicln("Template error:", err)
		}
	}
}