Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 8 additions & 19 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ env:

jobs:
build:

runs-on: ubuntu-latest
permissions:
contents: read
Expand Down Expand Up @@ -65,7 +64,6 @@ jobs:
if: github.event_name != 'pull_request'
uses: sigstore/cosign-installer@v3.5.0


# Workaround: https://github.com/docker/build-push-action/issues/461
- name: Setup Docker buildx
uses: docker/setup-buildx-action@79abd3f86f79a9d68a23c75a09a9a85889262adf
Expand All @@ -88,9 +86,8 @@ jobs:
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

# Build and push Docker image with Buildx (don't push on PR)
# https://github.com/docker/build-push-action
- name: Build and push Docker image
# Build and push default image (cuda12.8.0)
- name: Build and push Docker image (default cuda12.8.0)
id: build-and-push
uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a
with:
Expand All @@ -99,19 +96,11 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

# Build and push specific Docker image for deepep
# https://github.com/docker/build-push-action
- name: Build and push deepep Docker image
id: build-and-push-deepep
uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a
with:
context: .
file: ./docker/Dockerfile.deepep
push: ${{ github.event_name != 'pull_request' }}
tags: ghcr.io/modeltc/lightllm:main-deepep
build-args: |
CUDA_VERSION=12.8.0
ENABLE_DEEPEP=1
ENABLE_NIXL=1
ENABLE_CACHE=1
cache-from: type=gha
cache-to: type=gha,mode=max

Expand All @@ -128,4 +117,4 @@ jobs:
DIGEST: ${{ steps.build-and-push.outputs.digest }}
# This step uses the identity token to provision an ephemeral certificate
# against the sigstore community Fulcio instance.
run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST}
run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST}
6 changes: 1 addition & 5 deletions build_and_upload_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,4 @@ IMAGE_TAG=$2
ACCOUNT=$1
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG .
docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG

#deepep
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.deepep -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG-deepep .
docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG-deepep
docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG
117 changes: 102 additions & 15 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
ARG CUDA_VERSION=12.6.1
ARG CUDA_VERSION=12.8.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04

ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG VLLM_VERSION=0.11.0
ARG TARGETPLATFORM
ARG ENABLE_DEEPEP=1
ARG ENABLE_NIXL=1
ARG ENABLE_CACHE=1

ENV PATH=/opt/conda/bin:$PATH \
CONDA_PREFIX=/opt/conda

RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
libssl-dev \
curl \
g++ \
make \
git && \
RUN chmod 777 -R /tmp && \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

chmod 777 -R /tmp grants universal read, write, and execute permissions, which is a security risk. A more secure practice is to use chmod 1777 /tmp, which sets the sticky bit. This allows any user to create files in /tmp, but only the file's owner can delete or rename them.

RUN chmod 1777 -R /tmp && \

apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
libssl-dev \
curl \
g++ \
make \
git && \
rm -rf /var/lib/apt/lists/*

RUN case ${TARGETPLATFORM} in \
Expand All @@ -25,24 +33,103 @@ RUN case ${TARGETPLATFORM} in \

RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
esac && \
/opt/conda/bin/conda clean -ya


WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN pip install -U pip
RUN pip install -r /lightllm/requirements.txt --no-cache-dir
RUN pip install --no-cache-dir vllm==${VLLM_VERSION}
RUN pip install https://github.com/ModelTC/LightKernel/releases/download/v1.0.1/lightllm_kernel-0.1.0-cp310-cp310-linux_x86_64.whl

RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*

ENV CUDA_HOME=/usr/local/cuda \
GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

RUN if [ "${ENABLE_CACHE}" = "1" ]; then \
apt-get update && apt-get install -y libboost-dev && rm -rf /var/lib/apt/lists/*; \
LIGHTMEM_REF=5900baf92d85ef4dbda6124093506b0af906011a; \
pip install --no-deps -v "git+https://github.com/ModelTC/LightMem.git@${LIGHTMEM_REF}#egg=light_mem"; \
fi

RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \
apt-get update && apt-get install -y wget devscripts debhelper dh-make build-essential dkms && \
apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev && \
rm -rf /var/lib/apt/lists/*; \
mkdir -p /tmp/gdrcopy && cd /tmp \
&& git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
&& cd gdrcopy/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
&& cd / && rm -rf /tmp/gdrcopy; \
fi

# TODO: offline compile
# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \
set -e; \
ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so; \
NVSHMEM_VERSION=3.3.9; \
CUDA_ARCHS=90; \
wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
&& tar -xf nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz && mv nvshmem_src nvshmem \
&& cd nvshmem \
&& rm -f /root/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
&& NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} \
&& cmake --build build --target install -j64; \
DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58; \
cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..; \
cd /root/DeepEP && NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \
fi

RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel
RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
rm -rf /usr/lib/ucx && rm -rf /opt/hpcx/ucx && \
cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \
cd ucx && \
git checkout v1.19.x && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs=yes \
--with-dm \
--with-gdrcopy=/usr/local \
--with-efa \
--enable-mt && \
make -j && \
make -j install-strip && \
ldconfig; \
apt-get update && apt-get install -y pkg-config tmux net-tools && \
cd /usr/local/src; \
pip install --upgrade meson pybind11 patchelf; \
git clone https://github.com/ai-dynamo/nixl.git -b main && \
cd nixl && \
rm -rf build && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
cd build && \
ninja && \
ninja install && \
cd .. && pip install . --no-deps; \
fi

COPY . /lightllm
RUN pip install -e /lightllm --no-cache-dir
84 changes: 0 additions & 84 deletions docker/Dockerfile.deepep

This file was deleted.

94 changes: 0 additions & 94 deletions docker/Dockerfile.nixl

This file was deleted.

Loading