This repository has been archived by the owner on Jul 28, 2021. It is now read-only.
/
storage.go
343 lines (309 loc) · 12.2 KB
/
storage.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
package gcs
import (
"bufio"
"context"
"encoding/json"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
"github.com/Microsoft/opengcs/internal/storage"
"github.com/Microsoft/opengcs/internal/storage/overlay"
"github.com/Microsoft/opengcs/internal/storage/scsi"
"github.com/Microsoft/opengcs/service/gcs/prot"
oci "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
const (
	// baseFilesPath is the path in the utility VM containing all the files
	// that will be used as the base layer for containers. It is always slot
	// 0 of the overlay lowerdirs assembled in mountLayers.
	baseFilesPath = "/tmp/base/"
	// mappedDiskMountTimeout is the amount of time before
	// mountMappedVirtualDisks will give up trying to mount a device.
	mappedDiskMountTimeout = time.Second * 2
)
// mountSpec collects the arguments for a single mount(2) call: the source
// device, the file system type, and the flags/data options to mount with.
type mountSpec struct {
	Source     string   // device path to mount, e.g. /dev/sda or /dev/pmem0
	FileSystem string   // file system type passed to mount(2), e.g. "ext4"
	Flags      uintptr  // mount(2) flags, e.g. syscall.MS_RDONLY
	Options    []string // data options, comma-joined at mount time, e.g. "noload"
}
const (
	// From mount(8): Don't load the journal on mounting. Note that if the
	// filesystem was not unmounted cleanly, skipping the journal replay will
	// lead to the filesystem containing inconsistencies that can lead to any
	// number of problems. Used here for read-only layer mounts.
	mountOptionNoLoad = "noload"
	// Enable DAX mode. This turns off the local cache for the file system and
	// accesses the storage directly from host memory, reducing memory use
	// and increasing sharing across VMs. Only supported on vPMEM devices.
	mountOptionDax = "dax"
	// For now the file system is hard-coded
	defaultFileSystem = "ext4"
)
// Mount mounts the file system to the specified target.
func (ms *mountSpec) Mount(target string) error {
	// mount(2) takes the data options as a single comma-separated string.
	data := strings.Join(ms.Options, ",")
	if err := syscall.Mount(ms.Source, target, ms.FileSystem, ms.Flags, data); err != nil {
		return errors.Wrapf(err, "mount %s %s %s 0x%x %s", ms.Source, target, ms.FileSystem, ms.Flags, data)
	}
	return nil
}
// MountWithTimedRetry attempts mounting multiple times up until the given
// timout. This is necessary because there is a span of time between when the
// device name becomes available under /sys/bus/scsi and when it appears under
// /dev. Once it appears under /dev, there is still a span of time before it
// becomes mountable. Retrying mount should succeed in mounting the device as
// long as it becomes mountable under /dev before the timeout.
func (ms *mountSpec) MountWithTimedRetry(target string) error {
	deadline := time.Now().Add(mappedDiskMountTimeout)
	for {
		err := ms.Mount(target)
		if err == nil {
			return nil
		}
		// Keep retrying until the deadline; the device may simply not have
		// appeared under /dev yet.
		if time.Now().After(deadline) {
			return errors.Wrapf(err, "failed to mount directory %s for mapped virtual disk device %s", target, ms.Source)
		}
		time.Sleep(time.Millisecond * 10)
	}
}
// getLayerMounts computes the mount specs for the scratch and layers.
func (c *gcsCore) getLayerMounts(scratch string, layers []prot.Layer) (scratchMount *mountSpec, layerMounts []*mountSpec, err error) {
	layerMounts = make([]*mountSpec, 0, len(layers))
	for _, layer := range layers {
		device, _, resolveErr := c.deviceIDToName(layer.Path)
		if resolveErr != nil {
			return nil, nil, resolveErr
		}
		// Layers are immutable: mount read-only and skip journal replay.
		// TODO (dcantah): Add mountOptionDax when supported.
		layerMounts = append(layerMounts, &mountSpec{
			Source:     device,
			FileSystem: defaultFileSystem,
			Flags:      syscall.MS_RDONLY,
			Options:    []string{mountOptionNoLoad},
		})
	}
	// An empty scratch value indicates no scratch space is to be attached.
	if scratch == "" {
		return nil, layerMounts, nil
	}
	scratchDevice, _, resolveErr := c.deviceIDToName(scratch)
	if resolveErr != nil {
		return nil, nil, resolveErr
	}
	scratchMount = &mountSpec{
		Source:     scratchDevice,
		FileSystem: defaultFileSystem,
	}
	return scratchMount, layerMounts, nil
}
// getMappedVirtualDiskMounts uses the Lun values in the given disks to
// retrieve their associated mount spec.
func (c *gcsCore) getMappedVirtualDiskMounts(disks []prot.MappedVirtualDisk) ([]*mountSpec, error) {
	specs := make([]*mountSpec, len(disks))
	for i, disk := range disks {
		device, err := c.scsiLunToName(disk.Lun)
		if err != nil {
			return nil, errors.Wrapf(err, "failed to get device name for mapped virtual disk %s, lun %d", disk.ContainerPath, disk.Lun)
		}
		spec := &mountSpec{
			Source:     device,
			FileSystem: defaultFileSystem,
		}
		// Read-only disks additionally skip journal replay (noload).
		if disk.ReadOnly {
			spec.Flags |= syscall.MS_RDONLY
			spec.Options = append(spec.Options, mountOptionNoLoad)
		}
		specs[i] = spec
	}
	return specs, nil
}
// scsiLunToName finds the SCSI device with the given LUN. This assumes
// only one SCSI controller.
func (c *gcsCore) scsiLunToName(lun uint8) (string, error) {
	// Bound the sysfs scan so a missing device cannot hang the caller.
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	return scsi.ControllerLunToName(ctx, 0, lun)
}
// deviceIDToName converts a device ID (scsi:<lun> or pmem:<device#> to a
// device name (/dev/sd? or /dev/pmem?).
// For temporary compatibility, this also accepts just <lun> for SCSI devices.
func (c *gcsCore) deviceIDToName(id string) (device string, pmem bool, err error) {
	const (
		pmemPrefix = "pmem:"
		scsiPrefix = "scsi:"
	)
	if strings.HasPrefix(id, pmemPrefix) {
		return "/dev/pmem" + id[len(pmemPrefix):], true, nil
	}
	// With or without the "scsi:" prefix, the remainder must be a LUN.
	lunStr := strings.TrimPrefix(id, scsiPrefix)
	// ParseUint rejects negative LUN strings. The previous
	// ParseInt(lunStr, 10, 8) accepted "-1".."-128" and the uint8(lun)
	// conversion silently wrapped them to 255..128, resolving the wrong
	// device instead of reporting an invalid ID.
	if lun, parseErr := strconv.ParseUint(lunStr, 10, 8); parseErr == nil {
		name, nameErr := c.scsiLunToName(uint8(lun))
		return name, false, nameErr
	}
	return "", false, errors.Errorf("unknown device ID %s", id)
}
// mountMappedVirtualDisks mounts the given disks to the given directories,
// with the given options. The device names of each disk are given in a
// parallel slice.
func (c *gcsCore) mountMappedVirtualDisks(disks []prot.MappedVirtualDisk, mounts []*mountSpec) error {
	if len(disks) != len(mounts) {
		return errors.Errorf("disk and device slices were of different sizes. disks: %d, mounts: %d", len(disks), len(mounts))
	}
	for i := range disks {
		disk := disks[i]
		// An AttachOnly disk is surfaced to the VM but never mounted here.
		if disk.AttachOnly {
			continue
		}
		if !disk.CreateInUtilityVM {
			return errors.New("we do not currently support mapping virtual disks inside the container namespace")
		}
		if err := os.MkdirAll(disk.ContainerPath, 0700); err != nil {
			return errors.Wrapf(err, "failed to create directory for mapped virtual disk %s", disk.ContainerPath)
		}
		// Retry the mount: the device node may not be usable yet.
		if err := mounts[i].MountWithTimedRetry(disk.ContainerPath); err != nil {
			return err
		}
	}
	return nil
}
// mountLayers mounts each device into a mountpoint, and then layers them into a
// union filesystem in the given order.
// These mountpoints are all stored under a directory reserved for the container
// with the given index.
func (c *gcsCore) mountLayers(index uint32, scratchMount *mountSpec, layers []*mountSpec) error {
	layerPrefix, scratchPath, upperdirPath, workdirPath, rootfsPath := c.getUnioningPaths(index)
	logrus.Infof("layerPrefix=%s", layerPrefix)
	logrus.Infof("scratchPath:%s", scratchPath)
	logrus.Infof("upperdirPath:%s", upperdirPath)
	logrus.Infof("workdirPath=%s", workdirPath)
	logrus.Infof("rootfsPath=%s", rootfsPath)
	// Mount the layer devices. Slot 0 of layerPaths is reserved for
	// baseFilesPath (filled in below), so device i lands in slot i+1.
	layerPaths := make([]string, len(layers)+1)
	for i, layer := range layers {
		// Each layer device gets its own numbered mountpoint directory
		// under the container's storage path.
		layerPath := filepath.Join(layerPrefix, strconv.Itoa(i))
		logrus.Infof("layerPath: %s", layerPath)
		if err := os.MkdirAll(layerPath, 0700); err != nil {
			return errors.Wrapf(err, "failed to create directory for layer %s", layerPath)
		}
		if err := layer.Mount(layerPath); err != nil {
			return errors.Wrapf(err, "failed to mount layer directory %s", layerPath)
		}
		layerPaths[i+1] = layerPath
	}
	// TODO: The base path code may be temporary until a more permanent DNS
	// solution is reached.
	// NOTE: This should probably still always be kept, because otherwise
	// mounting will fail when no layer devices are attached. There should
	// always be at least one layer, even if it's empty, to prevent this
	// from happening.
	layerPaths[0] = baseFilesPath
	// Mount the layers into a union filesystem.
	if err := os.MkdirAll(baseFilesPath, 0700); err != nil {
		return errors.Wrapf(err, "failed to create directory for base files %s", baseFilesPath)
	}
	if err := os.MkdirAll(scratchPath, 0755); err != nil {
		return errors.Wrapf(err, "failed to create directory for scratch space %s", scratchPath)
	}
	// The scratch device (if any) must be mounted before the overlay is
	// assembled, since upperdir/workdir live inside scratchPath.
	if scratchMount != nil {
		if err := scratchMount.Mount(scratchPath); err != nil {
			return errors.Wrapf(err, "failed to mount scratch directory %s", scratchPath)
		}
	} else {
		// NOTE: V1 has never supported a readonly overlay mount. It was "by
		// accident" always getting a writable overlay. Do nothing here if the
		// call does not have a scratch path.
	}
	return overlay.Mount(context.Background(), layerPaths, upperdirPath, workdirPath, rootfsPath, false)
}
// unmountLayers unmounts the union filesystem for the container with the given
// ID, as well as any devices whose mountpoints were layers in that filesystem.
func (c *gcsCore) unmountLayers(index uint32) error {
	layerPrefix, scratchPath, _, _, rootfsPath := c.getUnioningPaths(index)
	ctx := context.Background()
	// The overlay comes down first, then the scratch mount beneath it.
	if err := storage.UnmountPath(ctx, rootfsPath, false); err != nil {
		return errors.Wrap(err, "failed to unmount root filesytem")
	}
	if err := storage.UnmountPath(ctx, scratchPath, false); err != nil {
		return errors.Wrap(err, "failed to unmount scratch")
	}
	// Each read-only layer device was mounted in its own numbered directory
	// under layerPrefix; unmount whatever is there.
	layerPaths, err := filepath.Glob(filepath.Join(layerPrefix, "*"))
	if err != nil {
		return errors.Wrap(err, "failed to get layer paths using Glob")
	}
	for _, p := range layerPaths {
		if err := storage.UnmountPath(ctx, p, false); err != nil {
			return errors.Wrap(err, "failed to unmount layer")
		}
	}
	return nil
}
// destroyContainerStorage removes any files the GCS stores on disk for the
// container with the given ID.
// These files include directories used for mountpoints in the union filesystem
// and config files.
func (c *gcsCore) destroyContainerStorage(index uint32) error {
	storagePath := c.getContainerStoragePath(index)
	if err := os.RemoveAll(storagePath); err != nil {
		return errors.Wrapf(err, "failed to remove container storage path for container %s", c.getContainerIDFromIndex(index))
	}
	return nil
}
// writeConfigFile writes the given oci.Spec to disk so that it can be consumed
// by an OCI runtime.
func (c *gcsCore) writeConfigFile(index uint32, config *oci.Spec) error {
	if config == nil {
		return errors.New("failed to write init process config file, no options specified")
	}
	configPath := c.getConfigPath(index)
	if err := os.MkdirAll(filepath.Dir(configPath), 0700); err != nil {
		return errors.Wrapf(err, "failed to create config file directory for container %s", c.getContainerIDFromIndex(index))
	}
	f, err := os.Create(configPath)
	if err != nil {
		return errors.Wrapf(err, "failed to create config file for container %s", c.getContainerIDFromIndex(index))
	}
	defer f.Close()
	// Buffer the JSON encoding and flush explicitly so a write failure is
	// surfaced as an error rather than lost in Close.
	buffered := bufio.NewWriter(f)
	if err := json.NewEncoder(buffered).Encode(config); err != nil {
		return errors.Wrapf(err, "failed to write contents of config file for container %s", c.getContainerIDFromIndex(index))
	}
	if err := buffered.Flush(); err != nil {
		return errors.Wrapf(err, "failed to flush to config file for container %s", c.getContainerIDFromIndex(index))
	}
	return nil
}
// getContainerStoragePath returns the path where the GCS stores files on disk
// for the container with the given index.
func (c *gcsCore) getContainerStoragePath(index uint32) string {
	dirName := strconv.FormatUint(uint64(index), 10)
	return filepath.Join(c.baseStoragePath, dirName)
}
// getUnioningPaths returns paths that will be used in the union filesystem for
// the container with the given index.
func (c *gcsCore) getUnioningPaths(index uint32) (layerPrefix string, scratchPath string, upperdirPath string, workdirPath string, rootfsPath string) {
	base := c.getContainerStoragePath(index)
	scratch := filepath.Join(base, "scratch")
	// Layer mountpoints live directly under the container storage path;
	// upper/work live inside the scratch mount; rootfs holds the overlay.
	return base,
		scratch,
		filepath.Join(scratch, "upper"),
		filepath.Join(scratch, "work"),
		filepath.Join(base, "rootfs")
}
// getConfigPath returns the path to the container's config file.
func (c *gcsCore) getConfigPath(index uint32) string {
	storageDir := c.getContainerStoragePath(index)
	return filepath.Join(storageDir, "config.json")
}