Skip to content
This repository has been archived by the owner on Jan 10, 2023. It is now read-only.

Commit

Permalink
Systemd support
Browse files Browse the repository at this point in the history
For all containers:
- Mount /run as tmpfs (default size 128 MiB)

For systemd labeled containers (those running an image with the
`com.netflix.titus.systemd` label set to "true"):
- Mount `/run/lock` as its own tmpfs mount
- Tini exec's the container's init command so that it runs as pid 1
- Run them using the standard apparmor and seccomp profiles: no
  CAP_SYS_ADMIN requirement

Other notes:
- This requires that cgroup namespaces are enabled in docker, otherwise
  the systemd container will fail to come up due to not being able to
  create new cgroups.
- Move to Bionic for the systemd test image: the version of systemd that
  ships with it is able to start without CAP_SYS_ADMIN
  • Loading branch information
rgulewich committed Jan 28, 2019
1 parent 1199ace commit 4bb18a7
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Expand Up @@ -32,7 +32,7 @@ jobs:
name: Login into Docker hub
command: docker login -u $DOCKER_USER -p $DOCKER_PASS
- run: ./build.sh ubuntu
- run: ./build.sh ubuntu-systemd-xenial
- run: ./build.sh ubuntu-systemd-bionic
- run: ./build.sh no-entrypoint
- run: ./build.sh shell-entrypoint
- run: ./build.sh ignore-signals
Expand Down
5 changes: 5 additions & 0 deletions executor/mock/jobrunner.go
Expand Up @@ -64,6 +64,8 @@ type JobInput struct {
Tty bool
// MetatronEnabled enables running with the metatron sidecar container
MetatronEnabled bool
// Mem sets the memory resource attribute in MiB
Mem *int64
}

// JobRunResponse returned from RunJob
Expand Down Expand Up @@ -303,6 +305,9 @@ func (jobRunner *JobRunner) StartJob(jobInput *JobInput) *JobRunResponse { // no
cpu = *jobInput.CPU
}
memMiB := int64(400)
if jobInput.Mem != nil {
memMiB = *jobInput.Mem
}
diskMiB := uint64(100)

// Get a reference to the executor and somewhere to stash results
Expand Down
52 changes: 52 additions & 0 deletions executor/mock/standalone/standalone_test.go
Expand Up @@ -85,6 +85,10 @@ var (
name: "titusoss/ubuntu-env-label",
tag: "20180621-1529540359",
}
systemdImage = testImage{
name: "titusoss/ubuntu-systemd-bionic",
tag: "20181219-1545261266",
}
)

// This file still uses log as opposed to using the testing library's built-in logging framework.
Expand Down Expand Up @@ -136,6 +140,9 @@ func TestStandalone(t *testing.T) {
testTtyNegative,
testCachedDockerPull,
testMetatron,
testRunTmpFsMount,
testSmallTmpFsMount,
testSystemdImageMount,
}
for _, fun := range testFunctions {
fullName := runtime.FuncForPC(reflect.ValueOf(fun).Pointer()).Name()
Expand Down Expand Up @@ -957,3 +964,48 @@ func testMetatron(t *testing.T, jobID string) {
t.Fail()
}
}

// testRunTmpFsMount starts a job on the ubuntu image with 256 MiB of memory
// and expects the container's tmpfs mount listing to contain `/run` with the
// default size (reported by findmnt as 128M).
func testRunTmpFsMount(t *testing.T, jobID string) {
	memMiB := int64(256)
	input := mock.JobInput{
		JobID:     jobID,
		ImageName: ubuntu.name,
		Version:   ubuntu.tag,
		// The pipeline exits non-zero unless a tmpfs mount whose target starts
		// with "/run" followed by a non-slash character is listed on a line
		// that also contains "128M".
		EntrypointOld: `/bin/bash -c 'findmnt -l -t tmpfs -o target,size | grep -e "/run[^/]" | grep 128M'`,
		Mem:           &memMiB,
	}

	if succeeded := mock.RunJobExpectingSuccess(&input); succeeded {
		return
	}
	t.Fail()
}

// testSmallTmpFsMount starts a job on the ubuntu image with only 64 MiB of
// memory, which is less than the default size for `/run`, and expects the
// `/run` tmpfs mount to be limited to that same amount (reported by findmnt
// as 64M).
func testSmallTmpFsMount(t *testing.T, jobID string) {
	memMiB := int64(64)
	input := mock.JobInput{
		JobID:     jobID,
		ImageName: ubuntu.name,
		Version:   ubuntu.tag,
		// The pipeline exits non-zero unless a tmpfs mount whose target starts
		// with "/run" followed by a non-slash character is listed on a line
		// that also contains "64M".
		EntrypointOld: `/bin/bash -c 'findmnt -l -t tmpfs -o target,size | grep -e "/run[^/]" | grep 64M'`,
		Mem:           &memMiB,
	}

	if succeeded := mock.RunJobExpectingSuccess(&input); succeeded {
		return
	}
	t.Fail()
}

// testSystemdImageMount starts a job on the systemd labeled image with
// 256 MiB of memory and expects `/run/lock` to be listed as its own tmpfs
// mount with the default size (reported by findmnt as 5M).
func testSystemdImageMount(t *testing.T, jobID string) {
	memMiB := int64(256)
	input := mock.JobInput{
		JobID:     jobID,
		ImageName: systemdImage.name,
		Version:   systemdImage.tag,
		// The pipeline exits non-zero unless a tmpfs mount whose target starts
		// with "/run/lock" followed by a non-slash character is listed on a
		// line that also contains "5M".
		EntrypointOld: `/bin/bash -c 'findmnt -l -t tmpfs -o target,size | grep -e "/run/lock[^/]" | grep 5M'`,
		Mem:           &memMiB,
	}

	if succeeded := mock.RunJobExpectingSuccess(&input); succeeded {
		return
	}
	t.Fail()
}
1 change: 1 addition & 0 deletions executor/runtime/container.go
Expand Up @@ -61,6 +61,7 @@ func NewContainer(taskID string, titusInfo *titus.ContainerInfo, resources *runt
TitusInfo: titusInfo,
Resources: resources,
Env: env,
IsSystemD: false,
Labels: labels,
SecurityGroupIDs: networkCfgParams.GetSecurityGroups(),
BandwidthLimitMbps: networkCfgParams.GetBandwidthLimitMbps(),
Expand Down
6 changes: 5 additions & 1 deletion executor/runtime/docker/capabilities.go
Expand Up @@ -33,6 +33,7 @@ func setupAdditionalCapabilities(c *runtimeTypes.Container, hostCfg *container.H

// Privileged containers automatically deactivate seccomp and friends, no need to do this
fuseEnabled, err := c.GetFuseEnabled()

if err != nil {
return err
}
Expand All @@ -58,9 +59,12 @@ func setupAdditionalCapabilities(c *runtimeTypes.Container, hostCfg *container.H
if c.TitusInfo.GetAllowNestedContainers() {
apparmorProfile = "docker-nested"
seccompProfile = "nested-container.json"
c.Env["TINI_UNSHARE"] = trueString
}

if c.TitusInfo.GetAllowNestedContainers() || c.IsSystemD {
// Tell Tini to exec systemd so it's pid 1
c.Env["TINI_HANDOFF"] = trueString
c.Env["TINI_UNSHARE"] = trueString
}

if apparmorProfile != "" {
Expand Down
40 changes: 39 additions & 1 deletion executor/runtime/docker/docker.go
Expand Up @@ -59,8 +59,11 @@ const (
builtInDiskBuffer = 1100 // In megabytes, includes extra space for /logs.
defaultNetworkBandwidth = 128 * MB
defaultKillWait = 10 * time.Second
defaultRunTmpFsSize = 128 * MiB
defaultRunLockTmpFsSize = 5 * MiB // The default setting on Ubuntu Xenial
trueString = "true"
jumboFrameParam = "titusParameter.agent.allowNetworkJumbo"
systemdImageLabel = "com.netflix.titus.systemd"
)

const envFileTemplateStr = `
Expand Down Expand Up @@ -468,6 +471,20 @@ func (r *DockerRuntime) dockerConfig(c *runtimeTypes.Container, binds []string,
// Maybe set cfs bandwidth has to be called _after_
maybeSetCFSBandwidth(r.dockerCfg.cfsBandwidthPeriod, c, hostCfg)

// Always setup tmpfs: it's needed to ensure Metatron credentials don't persist across reboots and for SystemD to work
tmpFsSize := int64(defaultRunTmpFsSize)
if hostCfg.Memory < tmpFsSize {
tmpFsSize = hostCfg.Memory
}
hostCfg.Tmpfs = map[string]string{
"/run": fmt.Sprintf("rw,noexec,nosuid,size=%d", tmpFsSize),
}

if c.IsSystemD {
// systemd requires `/run/lock` to be a separate mount from `/run`
hostCfg.Tmpfs["/run/lock"] = fmt.Sprintf("rw,noexec,nosuid,size=%d", defaultRunLockTmpFsSize)
}

if r.storageOptEnabled {
hostCfg.StorageOpt = map[string]string{
"size": fmt.Sprintf("%dM", c.Resources.Disk+builtInDiskBuffer+uint64(imageSize/MiB)),
Expand Down Expand Up @@ -638,6 +655,26 @@ func vpcToolPath() string {
return ret
}

// setSystemdRunning uses the image's labels to decide whether the container
// should be configured to run SystemD, and records the decision in
// c.IsSystemD.
//
// If the systemdImageLabel label is present, its value is parsed with
// strconv.ParseBool and stored in c.IsSystemD. If the label is absent, or the
// image carries no config at all, c.IsSystemD is set to false.
//
// An error is returned only when the label is present but its value is not a
// valid boolean; c.IsSystemD is left untouched in that case.
func setSystemdRunning(imageInfo types.ImageInspect, c *runtimeTypes.Container) error {
	l := log.WithField("imageName", c.QualifiedImageName())

	// Config is a pointer in the Docker ImageInspect type, so guard against a
	// nil dereference. Looking up a key in a nil map is safe and simply
	// reports the label as missing.
	var labels map[string]string
	if imageInfo.Config != nil {
		labels = imageInfo.Config.Labels
	}

	systemdBool, ok := labels[systemdImageLabel]
	if !ok {
		l.Info("SystemD image label not set: not configuring container to run SystemD")
		c.IsSystemD = false
		return nil
	}

	val, err := strconv.ParseBool(systemdBool)
	if err != nil {
		// Name the offending label so the failure can be traced back to the
		// image rather than surfacing a bare strconv error.
		return fmt.Errorf("parsing image label %s=%q: %v", systemdImageLabel, systemdBool, err)
	}

	c.IsSystemD = val
	l.Infof("SystemD image label set to %t", val)
	return nil
}

// This will setup c.Allocation
func prepareNetworkDriver(parentCtx context.Context, cfg Config, c *runtimeTypes.Container) error { // nolint: gocyclo
log.Printf("Configuring VPC network for %s", c.TaskID)
Expand Down Expand Up @@ -910,7 +947,7 @@ func (r *DockerRuntime) Prepare(parentCtx context.Context, c *runtimeTypes.Conta
}

myImageInfo = imageInfo
return nil
return setSystemdRunning(*imageInfo, c)
})

if shouldStartMetatronSync(&r.cfg, c) {
Expand Down Expand Up @@ -1332,6 +1369,7 @@ func (r *DockerRuntime) Start(parentCtx context.Context, c *runtimeTypes.Contain
eventCancel()
return "", nil, statusMessageChan, err
}

err = r.setupEFSMounts(ctx, c, rootFile, containerCred, efsMountInfos)
if err != nil {
eventCancel()
Expand Down
3 changes: 2 additions & 1 deletion executor/runtime/docker/docker_linux.go
Expand Up @@ -187,9 +187,10 @@ func cleanupCgroups(cgroupPath string) error {
}

func setupContainerNesting(parentCtx context.Context, c *runtimeTypes.Container, cred ucred) error {
if !c.TitusInfo.GetAllowNestedContainers() {
if !c.IsSystemD && !c.TitusInfo.GetAllowNestedContainers() {
return nil
}

cgroupPath := filepath.Join("/proc/", strconv.FormatInt(int64(cred.pid), 10), "cgroup")
cgroups, err := ioutil.ReadFile(cgroupPath) // nolint: gosec
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions executor/runtime/types/types.go
Expand Up @@ -127,6 +127,9 @@ type Container struct {
NormalizedENIIndex int
BandwidthLimitMbps uint32

// Is this container meant to run SystemD?
IsSystemD bool

// GPU devices
GPUInfo GPUContainer

Expand Down
@@ -1,5 +1,6 @@
FROM ubuntu:xenial
FROM ubuntu:bionic

LABEL "com.netflix.titus.systemd"="true"
ENV container docker
ENV DEBIAN_FRONTEND noninteractive

Expand Down

0 comments on commit 4bb18a7

Please sign in to comment.