Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion tests/e2e/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,14 @@ LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs

GINKGO_BIN := $(CURDIR)/bin/ginkgo

# If GINKGO_FOCUS is not set, run all tests
# Currently available tests:
# - nvidia-container-cli
# - docker
GINKGO_FOCUS ?=

test: $(GINKGO_BIN)
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json --focus="$(GINKGO_FOCUS)" ./tests/e2e/...

# test-preinstalled runs the test cases against the version of the toolkit that
# is already installed (and configured for docker) on the host.
Expand Down
2 changes: 0 additions & 2 deletions tests/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@ func getTestEnv() {

if installCTK {
imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME")

imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG")

}

sshHost = getEnvVarOrDefault("E2E_SSH_HOST", "")
Expand Down
19 changes: 13 additions & 6 deletions tests/e2e/installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,18 @@ var dockerInstallTemplate = `
#! /usr/bin/env bash
set -xe

: ${IMAGE:={{.Image}}}

# Create a temporary directory
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
mkdir -p "$TEMP_DIR"
# if the TEMP_DIR is already set, use it
if [ -f /tmp/ctk_e2e_temp_dir.txt ]; then
TEMP_DIR=$(cat /tmp/ctk_e2e_temp_dir.txt)
else
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
echo "$TEMP_DIR" > /tmp/ctk_e2e_temp_dir.txt
fi

# if TEMP_DIR does not exist, create it
if [ ! -d "$TEMP_DIR" ]; then
mkdir -p "$TEMP_DIR"
fi

# Given that docker has an init function that checks for the existence of the
# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
Expand All @@ -46,7 +53,7 @@ docker run --pid=host --rm -i --privileged \
-v /var/run/docker.sock:/var/run/docker.sock \
-v "$TEMP_DIR:$TEMP_DIR" \
-v /etc/docker:/config-root \
${IMAGE} \
{{.Image}} \
--root "$TEMP_DIR" \
--runtime=docker \
--config=/config-root/daemon.json \
Expand Down
234 changes: 234 additions & 0 deletions tests/e2e/nvidia-container-cli_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package e2e

import (
"context"
"fmt"
"strings"
"text/template"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// Shell scripts and templates used by the nvidia-container-cli e2e test.
//
// The three scripts that run inside the test container are embedded in a
// single-quoted `bash -c '...'` command line, so any literal single quote in
// them must be written as '\''.
const (
	// installDockerTemplate is a plain shell script: despite its name it
	// contains no template actions and is passed to bash verbatim. It installs
	// Docker CE from the upstream apt repository, starts dockerd in the
	// background and polls "docker info" for up to 30 seconds, exiting with
	// status 1 if the daemon has not become ready by then.
	installDockerTemplate = `
export DEBIAN_FRONTEND=noninteractive

# Add Docker official GPG key:
apt-get update
apt-get install -y ca-certificates curl apt-utils gnupg2
install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
apt-get update

apt-get install -y docker-ce docker-ce-cli containerd.io

# start dockerd in the background
dockerd &

# wait for dockerd to be ready with timeout
timeout=30
elapsed=0
while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
echo "Waiting for dockerd to be ready..."
sleep 1
elapsed=$((elapsed + 1))
done
if [ $elapsed -ge $timeout ]; then
echo "Docker failed to start within $timeout seconds"
exit 1
fi
`

	// installCTKTemplate is a text/template whose only parameter is
	// .ToolkitImage. The rendered script copies /artifacts out of the
	// "<ToolkitImage>-packaging" image into a fresh temporary directory,
	// installs the ubuntu18.04/amd64 libnvidia-container1,
	// nvidia-container-toolkit-base and libnvidia-container-tools packages
	// with dpkg, and prints the nvidia-container-cli version.
	installCTKTemplate = `
# Create a temporary directory and rootfs path
TMPDIR="$(mktemp -d)"

# Expose TMPDIR for the child namespace
export TMPDIR

docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb

nvidia-container-cli --version
`

	// libnvidiaContainerCliTestTemplate is a plain shell script (no template
	// actions). It unpacks an Ubuntu 22.04 base rootfs into a temporary
	// directory, enters new mount and PID namespaces, runs
	// "nvidia-container-cli configure" against that rootfs, pivots into it and
	// prints the GPU list with "nvidia-smi -L".
	libnvidiaContainerCliTestTemplate = `
# Create a temporary directory and rootfs path
TMPDIR="$(mktemp -d)"
ROOTFS="${TMPDIR}/rootfs"
mkdir -p "${ROOTFS}"

# Expose ROOTFS for the child namespace
export ROOTFS TMPDIR

# Download Ubuntu base image with error handling
curl -fsSL http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz | tar -C $ROOTFS -xz || {
echo "Failed to download or extract Ubuntu base image"
exit 1
}

# Enter a new mount + PID namespace so we can pivot_root without touching the
# container'\''s original filesystem.
unshare --mount --pid --fork --propagation private -- sh -eux <<'\''IN_NS'\''
: "${ROOTFS:?}"

# 1 Bind-mount the new root and make the mount private
mount --bind "$ROOTFS" "$ROOTFS"
mount --make-private "$ROOTFS"
cd "$ROOTFS"

# 2 Minimal virtual filesystems
mount -t proc proc proc
mount -t sysfs sys sys
mount -t tmpfs tmp tmp
mount -t tmpfs run run

# 3 Configure NVIDIA devices
nvidia-container-cli --load-kmods configure --ldconfig=@/sbin/ldconfig.real --no-cgroups --utility --device 0 $(pwd)

# 4 Switch root into the prepared filesystem
pivot_root . mnt
umount -l mnt
nvidia-smi -L

IN_NS
`

	// startTestContainerTemplate is a text/template that renders the
	// "docker run" command starting the detached, privileged test container.
	// It takes .ContainerName and .AdditionalArguments; each additional
	// argument is emitted on its own continuation line ahead of the image name.
	startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
-e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
-e NVIDIA_DRIVER_CAPABILITIES=all \
{{ range $i, $a := .AdditionalArguments -}}
{{ $a }} \
{{ end -}}
ubuntu sleep infinity`
)

// This ordered suite checks that nvidia-container-cli, driven from inside a
// privileged test container, exposes GPUs that match what "nvidia-smi -L"
// reports on the host. BeforeAll prepares the toolkit (installed or
// preinstalled) and records the host GPU list; the single spec starts the
// container, installs Docker (and optionally the toolkit packages) inside it,
// runs the pivot_root test script and compares the output against the host.
var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libnvidia-container"), func() {
	var (
		runner                       Runner
		containerName                = "node-container-e2e"
		hostOutput                   string
		additionalContainerArguments []string
	)

	// normalizeOutput strips carriage returns and surrounding whitespace so
	// that host and container output are compared on equal terms regardless
	// of the line endings the runner hands back.
	normalizeOutput := func(raw string) string {
		return strings.TrimSpace(strings.ReplaceAll(raw, "\r", ""))
	}

	// execInContainer runs script as root inside the test container through
	// `bash -c '<script>'` and returns the first value reported by the runner.
	// The script is placed between single quotes unmodified, so it must not
	// contain unescaped single quotes.
	execInContainer := func(script string) (string, error) {
		stdout, _, err := runner.Run(fmt.Sprintf("docker exec -u root %s bash -c '%s'", containerName, script))
		return stdout, err
	}

	BeforeAll(func(ctx context.Context) {
		runner = NewRunner(
			WithHost(sshHost),
			WithPort(sshPort),
			WithSshKey(sshKey),
			WithSshUser(sshUser),
		)

		if installCTK {
			installer, err := NewToolkitInstaller(
				WithRunner(runner),
				WithImage(imageName+":"+imageTag),
				WithTemplate(dockerInstallTemplate),
			)
			Expect(err).ToNot(HaveOccurred())

			err = installer.Install()
			Expect(err).ToNot(HaveOccurred())
		} else {
			// If installCTK is false, we use the preinstalled toolkit: the
			// host's libnvidia-container libraries and the nvidia-container-cli
			// binary are bind-mounted into the test container.
			// TODO: This should be updated for other distributions and other components of the toolkit.
			output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*")
			Expect(err).ToNot(HaveOccurred())

			output = strings.TrimSpace(output)
			Expect(output).ToNot(BeEmpty())

			for _, lib := range strings.Split(output, "\n") {
				additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib)
			}
			additionalContainerArguments = append(additionalContainerArguments,
				"-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli",
			)
		}

		// Capture the host GPU list.
		var err error
		hostOutput, _, err = runner.Run("nvidia-smi -L")
		Expect(err).ToNot(HaveOccurred())

		// Normalize the output once.
		hostOutput = normalizeOutput(hostOutput)

		// If a container with the same name exists from a previous test run, remove it first.
		// Ignore errors as the container might not exist.
		runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
	})

	AfterAll(func(ctx context.Context) {
		// Cleanup: remove the test container.
		// Use || true to ensure cleanup doesn't fail the test.
		runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
	})

	It("should report the same GPUs inside the container as on the host", func(ctx context.Context) {
		// Launch the container in detached mode.
		var startContainerScriptBuilder strings.Builder
		startContainerTemplate, err := template.New("startContainer").Parse(startTestContainerTemplate)
		Expect(err).ToNot(HaveOccurred())
		err = startContainerTemplate.Execute(&startContainerScriptBuilder, struct {
			ContainerName       string
			AdditionalArguments []string
		}{
			ContainerName:       containerName,
			AdditionalArguments: additionalContainerArguments,
		})
		Expect(err).ToNot(HaveOccurred())

		startContainerScript := startContainerScriptBuilder.String()
		GinkgoLogr.Info("Starting test container", "script", startContainerScript)
		_, _, err = runner.Run(startContainerScript)
		Expect(err).ToNot(HaveOccurred())

		// Install docker in the container.
		_, err = execInContainer(installDockerTemplate)
		Expect(err).ToNot(HaveOccurred())

		if installCTK {
			// Install nvidia-container-cli in the container.
			tmpl, err := template.New("toolkitInstall").Parse(installCTKTemplate)
			Expect(err).ToNot(HaveOccurred())

			var toolkitInstall strings.Builder
			err = tmpl.Execute(&toolkitInstall, struct {
				ToolkitImage string
			}{
				ToolkitImage: imageName + ":" + imageTag,
			})
			Expect(err).ToNot(HaveOccurred())

			_, err = execInContainer(toolkitInstall.String())
			Expect(err).ToNot(HaveOccurred())
		}

		// Run the test script in the container.
		output, err := execInContainer(libnvidiaContainerCliTestTemplate)
		Expect(err).ToNot(HaveOccurred())

		// Normalize exactly like hostOutput; otherwise stray carriage returns
		// in multi-line container output would defeat the substring match.
		containerOutput := normalizeOutput(output)
		Expect(containerOutput).ToNot(BeEmpty())
		Expect(hostOutput).To(ContainSubstring(containerOutput))
	})
})