-
Notifications
You must be signed in to change notification settings - Fork 39.6k
/
nvidia_gpu_manager.go
275 lines (249 loc) · 10.2 KB
/
nvidia_gpu_manager.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nvidia
import (
"fmt"
"io/ioutil"
"os"
"path"
"regexp"
"strings"
"sync"
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
"k8s.io/kubernetes/pkg/kubelet/gpu"
)
// TODO: rework to use Nvidia's NVML, which is more complex, but also provides more fine-grained information and stats.
const (
	// devDirectory is the directory that is listed when looking for GPU device files.
	devDirectory = "/dev"

	// Control devices shared by all NVIDIA GPUs. When the driver is installed
	// correctly both files exist; Start returns an error if either is missing.
	nvidiaCtlDevice string = "/dev/nvidiactl"
	nvidiaUVMDevice string = "/dev/nvidia-uvm"

	// Optional control device; it is only exposed to containers when it is present.
	nvidiaUVMToolsDevice string = "/dev/nvidia-uvm-tools"

	// nvidiaDeviceRE matches the base name of a GPU device file inside devDirectory.
	// The numeric suffix is optional in the pattern.
	nvidiaDeviceRE = `^nvidia[0-9]*$`
	// nvidiaFullpathRE matches the absolute path of a GPU device file.
	nvidiaFullpathRE = `^/dev/nvidia[0-9]*$`
)
// activePodsLister provides the pods that are currently active on the node.
// The GPU manager consults it to find containers whose GPUs are in use and
// to release GPUs held by pods that are no longer active.
type activePodsLister interface {
	// GetActivePods returns a list of active pods on the node.
	GetActivePods() []*v1.Pod
}
// nvidiaGPUManager manages nvidia gpu devices.
// The embedded mutex is held by Start and AllocateGPU while they read or
// modify the fields below.
type nvidiaGPUManager struct {
	sync.Mutex
	// All gpus available on the Node, stored as device paths.
	// Populated by discoverGPUs.
	allGPUs sets.String
	// GPUs currently assigned, recorded per pod UID and container name.
	// Not set by the constructor; it is populated on the first AllocateGPU
	// call that requests GPUs.
	allocated *podGPUs
	// Control device files appended to every non-empty allocation.
	// Set by Start.
	defaultDevices []string
	// The interface which could get GPU mapping from all the containers.
	// TODO: Should make this independent of Docker in the future.
	dockerClient libdocker.Interface
	// Source of the pods currently active on the node.
	activePodsLister activePodsLister
}
// NewNvidiaGPUManager builds a GPUManager for the Nvidia GPUs local to this node.
// A nil dockerClient is rejected with an error. The returned manager has an
// empty GPU set; device discovery happens in Start.
// TODO: Migrate to use pod level cgroups and make it generic to all runtimes.
func NewNvidiaGPUManager(activePodsLister activePodsLister, dockerClient libdocker.Interface) (gpu.GPUManager, error) {
	if dockerClient == nil {
		return nil, fmt.Errorf("invalid docker client specified")
	}

	manager := &nvidiaGPUManager{
		activePodsLister: activePodsLister,
		dockerClient:     dockerClient,
		allGPUs:          sets.NewString(),
	}
	return manager, nil
}
// Start initializes the manager; so far that only means discovering the
// device paths of the GPUs and of the NVIDIA control devices.
//
// It returns an error when no docker client is configured, when either of
// the required control devices (nvidiactl, nvidia-uvm) cannot be stat'ed, or
// when listing the device directory fails. The optional nvidia-uvm-tools
// device is added to the default device list only when it can be stat'ed.
func (ngm *nvidiaGPUManager) Start() error {
	if ngm.dockerClient == nil {
		return fmt.Errorf("Invalid docker client specified in GPU Manager")
	}
	ngm.Lock()
	defer ngm.Unlock()

	if _, err := os.Stat(nvidiaCtlDevice); err != nil {
		return err
	}
	if _, err := os.Stat(nvidiaUVMDevice); err != nil {
		return err
	}
	ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice}

	// The tools device is optional: expose it only when stat succeeds. Any
	// stat failure (not just "does not exist") means the device is skipped
	// rather than handed to containers in an unknown state.
	if _, err := os.Stat(nvidiaUVMToolsDevice); err == nil {
		ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice)
	}

	if err := ngm.discoverGPUs(); err != nil {
		return err
	}
	// GPUs already in use by running containers are deliberately not
	// reconciled here because the runtime interfaces may not be up yet.
	// That is deferred to the first AllocateGPU call.
	return nil
}
// Capacity reports how many GPU cards were discovered on the node, as a
// resource list with a single v1.ResourceNvidiaGPU entry.
// It reads the GPU set without taking the manager lock.
func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList {
	count := int64(ngm.allGPUs.Len())
	quantity := resource.NewQuantity(count, resource.DecimalSI)
	return v1.ResourceList{v1.ResourceNvidiaGPU: *quantity}
}
// AllocateGPU returns the device paths to expose to `container` of `pod`:
// the GPUs assigned to the container followed by the default NVIDIA control
// devices. The number of GPUs is taken from the container's NvidiaGPU limit.
// An empty list is returned when the limit is zero; an error is returned when
// the limit is negative or when fewer GPUs are available than requested.
//
// Allocation is made thread safe using the following logic.
// A list of all GPUs allocated is maintained along with their respective Pod UIDs.
// It is expected that the list of active pods will not return any false positives.
// On the first allocation, the list of GPUs in use is computed once by inspecting containers.
// On every later allocation, the list of GPUs allocated is updated based on the list of currently active pods.
// GPUs allocated to terminated pods are freed up lazily as part of allocation.
// GPUs are allocated based on the internal list of allocated GPUs.
// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation.
// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quickly enough.
// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage.
// This is a sub-optimal solution and a better alternative would be that of using pod level cgroups instead.
// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations.
// The pod level cgroups will then serve as a checkpoint of GPUs in use.
func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) {
	gpusNeeded := container.Resources.Limits.NvidiaGPU().Value()
	if gpusNeeded == 0 {
		return []string{}, nil
	}
	if gpusNeeded < 0 {
		// A negative count would pass the availability check below and then
		// panic in the slice expression; reject it explicitly instead.
		return nil, fmt.Errorf("invalid number of GPUs requested: %d", gpusNeeded)
	}
	ngm.Lock()
	defer ngm.Unlock()
	if ngm.allocated == nil {
		// Initialization is not complete. Try now. Failures can no longer be tolerated.
		ngm.allocated = ngm.gpusInUse()
	} else {
		// update internal list of GPUs in use prior to allocating new GPUs.
		ngm.updateAllocatedGPUs()
	}
	// Check if GPUs have already been allocated. If so return them right away.
	// This can happen if a container restarts for example.
	if devices := ngm.allocated.getGPUs(string(pod.UID), container.Name); devices != nil {
		glog.V(2).Infof("Found pre-allocated GPUs for container %q in Pod %q: %v", container.Name, pod.UID, devices.List())
		return append(devices.List(), ngm.defaultDevices...), nil
	}
	// Get GPU devices in use.
	devicesInUse := ngm.allocated.devices()
	glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
	// Get a list of available GPUs.
	available := ngm.allGPUs.Difference(devicesInUse)
	glog.V(5).Infof("gpus available: %v", available.List())
	if int64(available.Len()) < gpusNeeded {
		return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
	}
	ret := available.UnsortedList()[:gpusNeeded]
	for _, device := range ret {
		// Update internal allocated GPU cache.
		ngm.allocated.insert(string(pod.UID), container.Name, device)
	}
	// Add standard device files that need to be exposed.
	ret = append(ret, ngm.defaultDevices...)
	return ret, nil
}
// updateAllocatedGPUs prunes the allocation cache: every pod that holds GPUs
// but is missing from the current list of active pods has its entries deleted.
// It returns nothing. AllocateGPU calls it with the manager lock held.
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() {
	// Collect the UIDs of all pods that are still active.
	active := sets.NewString()
	for _, p := range ngm.activePodsLister.GetActivePods() {
		active.Insert(string(p.UID))
	}

	// Anything allocated to a pod outside that set is stale.
	stale := ngm.allocated.pods().Difference(active)
	glog.V(5).Infof("pods to be removed: %v", stale.List())
	ngm.allocated.delete(stale.List())
}
// discoverGPUs records every NVIDIA GPU device file found directly inside the
// device directory. Each non-directory entry whose name matches
// nvidiaDeviceRE is added to allGPUs by its full path.
// It returns the error from reading the directory, if any.
// TODO: Without NVML support we can only check whether GPU devices exist; we
// cannot health check them or get more information such as GPU cores, memory,
// or family name. NVML support is only needed once we want such features,
// for example scheduling containers according to GPU family name.
func (ngm *nvidiaGPUManager) discoverGPUs() error {
	pattern := regexp.MustCompile(nvidiaDeviceRE)
	entries, err := ioutil.ReadDir(devDirectory)
	if err != nil {
		return err
	}
	for _, entry := range entries {
		name := entry.Name()
		if entry.IsDir() || !pattern.MatchString(name) {
			continue
		}
		glog.V(2).Infof("Found Nvidia GPU %q", name)
		ngm.allGPUs.Insert(path.Join(devDirectory, name))
	}
	return nil
}
// gpusInUse builds the pod -> container -> GPU mapping for the currently
// active pods by inspecting their docker containers.
// Only containers with a non-zero NvidiaGPU limit are considered. Containers
// that cannot be inspected are logged and skipped.
func (ngm *nvidiaGPUManager) gpusInUse() *podGPUs {
	// containerRef pairs a runtime container ID with the container's name in the pod.
	type containerRef struct {
		id   string
		name string
	}
	// podRef groups the containers of a single pod that have to be inspected.
	type podRef struct {
		uid        string
		containers []containerRef
	}

	// Phase 1: work out which containers need to be inspected.
	var toInspect []podRef
	for _, pod := range ngm.activePodsLister.GetActivePods() {
		// GPUs are expected to be specified only in limits.
		gpuContainers := sets.NewString()
		for _, c := range pod.Spec.Containers {
			if c.Resources.Limits.NvidiaGPU().IsZero() {
				continue
			}
			gpuContainers.Insert(c.Name)
		}
		// Pods that requested no GPUs are irrelevant.
		if gpuContainers.Len() == 0 {
			continue
		}
		// TODO: If kubelet restarts right after allocating a GPU to a pod, the container might not have started yet and so container status might not be available yet.
		// Use an internal checkpoint instead or try using the CRI if its checkpoint is reliable.
		var refs []containerRef
		for _, status := range pod.Status.ContainerStatuses {
			if !gpuContainers.Has(status.Name) {
				continue
			}
			refs = append(refs, containerRef{
				id:   strings.Replace(status.ContainerID, "docker://", "", 1),
				name: status.Name,
			})
		}
		toInspect = append(toInspect, podRef{uid: string(pod.UID), containers: refs})
	}

	// Phase 2: ask docker which host devices each of those containers was given.
	inUse := newPodGPUs()
	for _, p := range toInspect {
		for _, ref := range p.containers {
			info, err := ngm.dockerClient.InspectContainer(ref.id)
			if err != nil {
				glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", ref.id, p.uid)
				continue
			}
			// Ranging over a nil device list is a no-op, so no nil check is needed.
			for _, dev := range info.HostConfig.Devices {
				if !isValidPath(dev.PathOnHost) {
					continue
				}
				glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", dev.PathOnHost, info.ID)
				inUse.insert(p.uid, ref.name, dev.PathOnHost)
			}
		}
	}
	return inUse
}
// nvidiaFullpathRegexp is nvidiaFullpathRE compiled once at package
// initialization instead of on every isValidPath call.
var nvidiaFullpathRegexp = regexp.MustCompile(nvidiaFullpathRE)

// isValidPath reports whether path is the full path of an NVIDIA GPU device
// file, i.e. whether it matches nvidiaFullpathRE.
func isValidPath(path string) bool {
	return nvidiaFullpathRegexp.MatchString(path)
}