diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 3902163ef7..d6abd0948a 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -20,7 +20,6 @@ env: jobs: build: - runs-on: ubuntu-latest permissions: contents: read @@ -65,7 +64,6 @@ jobs: if: github.event_name != 'pull_request' uses: sigstore/cosign-installer@v3.5.0 - # Workaround: https://github.com/docker/build-push-action/issues/461 - name: Setup Docker buildx uses: docker/setup-buildx-action@79abd3f86f79a9d68a23c75a09a9a85889262adf @@ -88,9 +86,8 @@ jobs: with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - # Build and push Docker image with Buildx (don't push on PR) - # https://github.com/docker/build-push-action - - name: Build and push Docker image + # Build and push default image (cuda12.8.0) + - name: Build and push Docker image (default cuda12.8.0) id: build-and-push uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a with: @@ -99,19 +96,11 @@ jobs: push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - - # Build and push specific Docker image for deepep - # https://github.com/docker/build-push-action - - name: Build and push deepep Docker image - id: build-and-push-deepep - uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a - with: - context: . - file: ./docker/Dockerfile.deepep - push: ${{ github.event_name != 'pull_request' }} - tags: ghcr.io/modeltc/lightllm:main-deepep + build-args: | + CUDA_VERSION=12.8.0 + ENABLE_DEEPEP=1 + ENABLE_NIXL=1 + ENABLE_CACHE=1 cache-from: type=gha cache-to: type=gha,mode=max @@ -128,4 +117,4 @@ jobs: DIGEST: ${{ steps.build-and-push.outputs.digest }} # This step uses the identity token to provision an ephemeral certificate # against the sigstore community Fulcio instance. 
- run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} + run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} \ No newline at end of file diff --git a/build_and_upload_docker.sh b/build_and_upload_docker.sh index fc7fd871f7..9533b10f70 100755 --- a/build_and_upload_docker.sh +++ b/build_and_upload_docker.sh @@ -18,8 +18,4 @@ IMAGE_TAG=$2 ACCOUNT=$1 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG . -docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG - -#deepep -DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.deepep -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG-deepep . -docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG-deepep \ No newline at end of file +docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 6d67fcf4df..8f73a603cc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,18 +1,26 @@ -ARG CUDA_VERSION=12.6.1 +ARG CUDA_VERSION=12.8.0 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 + ARG PYTHON_VERSION=3.10 ARG MAMBA_VERSION=24.7.1-0 +ARG VLLM_VERSION=0.11.0 ARG TARGETPLATFORM +ARG ENABLE_DEEPEP=1 +ARG ENABLE_NIXL=1 +ARG ENABLE_CACHE=1 + ENV PATH=/opt/conda/bin:$PATH \ CONDA_PREFIX=/opt/conda -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ +RUN chmod 777 -R /tmp && \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + libssl-dev \ + curl \ + g++ \ + make \ + git && \ rm -rf /var/lib/apt/lists/* RUN case ${TARGETPLATFORM} in \ @@ -25,24 +33,103 
@@ RUN case ${TARGETPLATFORM} in \ RUN case ${TARGETPLATFORM} in \ "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ + *) /opt/conda/bin/conda update -y conda && \ + /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ esac && \ /opt/conda/bin/conda clean -ya - WORKDIR /root COPY ./requirements.txt /lightllm/requirements.txt RUN pip install -U pip RUN pip install -r /lightllm/requirements.txt --no-cache-dir +RUN pip install --no-cache-dir vllm==${VLLM_VERSION} +RUN pip install https://github.com/ModelTC/LightKernel/releases/download/v1.0.1/lightllm_kernel-0.1.0-cp310-cp310-linux_x86_64.whl + +RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/* + +ENV CUDA_HOME=/usr/local/cuda \ + GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ + +RUN if [ "${ENABLE_CACHE}" = "1" ]; then \ + apt-get update && apt-get install -y libboost-dev && rm -rf /var/lib/apt/lists/*; \ + LIGHTMEM_REF=5900baf92d85ef4dbda6124093506b0af906011a; \ + pip install --no-deps -v "git+https://github.com/ModelTC/LightMem.git@${LIGHTMEM_REF}#egg=light_mem"; \ + fi -RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \ + apt-get update && apt-get install -y wget devscripts debhelper dh-make build-essential dkms && \ + apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev && \ + rm -rf /var/lib/apt/lists/*; \ + mkdir -p /tmp/gdrcopy && cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ + && cd gdrcopy/packages \ + && CUDA=/usr/local/cuda ./build-deb-packages.sh \ + && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ + && cd / && rm -rf /tmp/gdrcopy; \ + fi -# TODO: offline compile -# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install 
--no-deps -v . +RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \ + set -e; \ + ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so; \ + NVSHMEM_VERSION=3.3.9; \ + CUDA_ARCHS=90; \ + wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \ + && tar -xf nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz && mv nvshmem_src nvshmem \ + && cd nvshmem \ + && rm -f /root/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \ + && NVSHMEM_SHMEM_SUPPORT=0 \ + NVSHMEM_UCX_SUPPORT=0 \ + NVSHMEM_USE_NCCL=0 \ + NVSHMEM_MPI_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT=1 \ + NVSHMEM_PMIX_SUPPORT=0 \ + NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + NVSHMEM_USE_GDRCOPY=1 \ + cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} \ + && cmake --build build --target install -j64; \ + DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58; \ + cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..; \ + cd /root/DeepEP && NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \ + fi -RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel +RUN if [ "${ENABLE_NIXL}" = "1" ]; then \ + apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \ + DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \ + rm -rf /usr/lib/ucx && rm -rf /opt/hpcx/ucx && \ + cd /usr/local/src && \ + git clone https://github.com/openucx/ucx.git && \ + cd ucx && \ + git checkout v1.19.x && \ + ./autogen.sh && ./configure \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=/usr/local/cuda \ + --with-verbs=yes \ + --with-dm \ + --with-gdrcopy=/usr/local \ + --with-efa \ + --enable-mt && \ + make -j 
&& \ + make -j install-strip && \ + ldconfig; \ + apt-get update && apt-get install -y pkg-config tmux net-tools && \ + cd /usr/local/src; \ + pip install --upgrade meson pybind11 patchelf; \ + git clone https://github.com/ai-dynamo/nixl.git -b main && \ + cd nixl && \ + rm -rf build && \ + mkdir build && \ + meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \ + cd build && \ + ninja && \ + ninja install && \ + cd .. && pip install . --no-deps; \ + fi COPY . /lightllm RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/Dockerfile.deepep b/docker/Dockerfile.deepep deleted file mode 100644 index e765978b91..0000000000 --- a/docker/Dockerfile.deepep +++ /dev/null @@ -1,84 +0,0 @@ -ARG CUDA_VERSION=12.6.1 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -COPY ./requirements.txt /lightllm/requirements.txt -RUN pip install -U pip -RUN pip install -r /lightllm/requirements.txt --no-cache-dir - -RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly - -# TODO: offline 
compile -# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . - -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - - # Fix DeepEP IBGDA symlink -RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ - && cd nvshmem \ - && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cmake --build build --target install -j64 - -ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58 -RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. - -WORKDIR /root/DeepEP -ENV NVSHMEM_DIR=/root/nvshmem/install -RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir \ No newline at end of file diff --git a/docker/Dockerfile.nixl b/docker/Dockerfile.nixl deleted file mode 100644 index b8047bbd03..0000000000 --- a/docker/Dockerfile.nixl +++ /dev/null @@ -1,94 +0,0 @@ -ARG CUDA_VERSION=12.6.1 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -COPY ./requirements.txt /lightllm/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124 - -RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly -RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . 
- -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - -RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \ - DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \ - rm -rf /usr/lib/ucx && \ - rm -rf /opt/hpcx/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout v1.19.x && \ - ./autogen.sh && ./configure \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs=yes \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --with-efa \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig; - -RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \ - cd /usr/local/src; \ - pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b main && \ - cd nixl && \ - rm -rf build && \ - mkdir build && \ - meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \ - cd build && \ - ninja && \ - ninja install && \ - cd .. && pip install . --no-deps; - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/Dockerfile.nixl.deepep b/docker/Dockerfile.nixl.deepep deleted file mode 100644 index 8ca06e1094..0000000000 --- a/docker/Dockerfile.nixl.deepep +++ /dev/null @@ -1,121 +0,0 @@ -ARG CUDA_VERSION=12.6.1 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 - -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM - -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -COPY ./requirements.txt /lightllm/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124 - -RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly -RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . 
- -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - - # Fix DeepEP IBGDA symlink -RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ - && cd nvshmem \ - && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cmake --build build --target install -j64 - -ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58 -RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. 
- -WORKDIR /root/DeepEP -ENV NVSHMEM_DIR=/root/nvshmem/install -RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install - -RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \ - DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \ - rm -rf /usr/lib/ucx && \ - rm -rf /opt/hpcx/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout v1.19.x && \ - ./autogen.sh && ./configure \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs=yes \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --with-efa \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig; - -RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \ - cd /usr/local/src; \ - pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b main && \ - cd nixl && \ - rm -rf build && \ - mkdir build && \ - meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \ - cd build && \ - ninja && \ - ninja install && \ - cd .. && pip install . --no-deps; - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/cuda_version_12.6.1/Dockerfile b/docker/cuda_version_12.6.1/Dockerfile deleted file mode 100644 index 6d67fcf4df..0000000000 --- a/docker/cuda_version_12.6.1/Dockerfile +++ /dev/null @@ -1,48 +0,0 @@ -ARG CUDA_VERSION=12.6.1 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -COPY ./requirements.txt /lightllm/requirements.txt -RUN pip install -U pip -RUN pip install -r /lightllm/requirements.txt --no-cache-dir - -RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly - -# TODO: offline compile -# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . - -RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/cuda_version_12.6.1/Dockerfile.deepep b/docker/cuda_version_12.6.1/Dockerfile.deepep deleted file mode 100644 index e765978b91..0000000000 --- a/docker/cuda_version_12.6.1/Dockerfile.deepep +++ /dev/null @@ -1,84 +0,0 @@ -ARG CUDA_VERSION=12.6.1 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -COPY ./requirements.txt /lightllm/requirements.txt -RUN pip install -U pip -RUN pip install -r /lightllm/requirements.txt --no-cache-dir - -RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly - -# TODO: offline compile -# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . 
- -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - - # Fix DeepEP IBGDA symlink -RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ - && cd nvshmem \ - && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cmake --build build --target install -j64 - -ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58 -RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. - -WORKDIR /root/DeepEP -ENV NVSHMEM_DIR=/root/nvshmem/install -RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir \ No newline at end of file diff --git a/docker/cuda_version_12.6.1/Dockerfile.nixl b/docker/cuda_version_12.6.1/Dockerfile.nixl deleted file mode 100644 index b8047bbd03..0000000000 --- a/docker/cuda_version_12.6.1/Dockerfile.nixl +++ /dev/null @@ -1,94 +0,0 @@ -ARG CUDA_VERSION=12.6.1 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -COPY ./requirements.txt /lightllm/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124 - -RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly -RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . 
- -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - -RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \ - DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \ - rm -rf /usr/lib/ucx && \ - rm -rf /opt/hpcx/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout v1.19.x && \ - ./autogen.sh && ./configure \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs=yes \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --with-efa \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig; - -RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \ - cd /usr/local/src; \ - pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b main && \ - cd nixl && \ - rm -rf build && \ - mkdir build && \ - meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \ - cd build && \ - ninja && \ - ninja install && \ - cd .. && pip install . --no-deps; - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/cuda_version_12.6.1/Dockerfile.nixl.deepep b/docker/cuda_version_12.6.1/Dockerfile.nixl.deepep deleted file mode 100644 index 8ca06e1094..0000000000 --- a/docker/cuda_version_12.6.1/Dockerfile.nixl.deepep +++ /dev/null @@ -1,121 +0,0 @@ -ARG CUDA_VERSION=12.6.1 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 - -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM - -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -COPY ./requirements.txt /lightllm/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124 - -RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly -RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . 
- -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - - # Fix DeepEP IBGDA symlink -RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ - && cd nvshmem \ - && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cmake --build build --target install -j64 - -ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58 -RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. 
- -WORKDIR /root/DeepEP -ENV NVSHMEM_DIR=/root/nvshmem/install -RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install - -RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \ - DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \ - rm -rf /usr/lib/ucx && \ - rm -rf /opt/hpcx/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout v1.19.x && \ - ./autogen.sh && ./configure \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs=yes \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --with-efa \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig; - -RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \ - cd /usr/local/src; \ - pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b main && \ - cd nixl && \ - rm -rf build && \ - mkdir build && \ - meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \ - cd build && \ - ninja && \ - ninja install && \ - cd .. && pip install . --no-deps; - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/cuda_version_12.8.0/Dockerfile b/docker/cuda_version_12.8.0/Dockerfile deleted file mode 100644 index 439031ce48..0000000000 --- a/docker/cuda_version_12.8.0/Dockerfile +++ /dev/null @@ -1,48 +0,0 @@ -ARG CUDA_VERSION=12.8.0 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -RUN pip install --no-cache-dir vllm==0.11.0 --pre --extra-index-url https://wheels.vllm.ai/nightly - -COPY ./requirements.txt /lightllm/requirements.txt -RUN pip install -U pip -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 - -# TODO: offline compile -# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . - -RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/cuda_version_12.8.0/Dockerfile.deepep b/docker/cuda_version_12.8.0/Dockerfile.deepep deleted file mode 100644 index 99997fc5bd..0000000000 --- a/docker/cuda_version_12.8.0/Dockerfile.deepep +++ /dev/null @@ -1,83 +0,0 @@ -ARG CUDA_VERSION=12.8.0 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -RUN pip install --no-cache-dir vllm==0.11.0 --pre --extra-index-url https://wheels.vllm.ai/nightly - -COPY ./requirements.txt /lightllm/requirements.txt -RUN pip install -U pip -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 -# TODO: offline compile -# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . 
- -RUN apt-get update --allow-insecure-repositories && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - - # Fix DeepEP IBGDA symlink -RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ - && cd nvshmem \ - && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cmake --build build --target install -j64 - -ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58 -RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. - -WORKDIR /root/DeepEP -ENV NVSHMEM_DIR=/root/nvshmem/install -RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir \ No newline at end of file diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl b/docker/cuda_version_12.8.0/Dockerfile.nixl deleted file mode 100644 index 4bcb66af56..0000000000 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl +++ /dev/null @@ -1,95 +0,0 @@ -ARG CUDA_VERSION=12.8.0 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -RUN --mount=type=cache,target=/root/.cache/pip pip install vllm==0.11.0 --pre --extra-index-url https://wheels.vllm.ai/nightly - -COPY ./requirements.txt /lightllm/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 - -RUN --mount=type=cache,target=/root/.cache/pip pip install --no-deps -v 'git+https://github.com/ModelTC/LightKernel.git@07f2f62af5deb41f10a22660f9f42dba9273361e#egg=lightllm_kernel' - -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make 
build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - -RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \ - DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \ - rm -rf /usr/lib/ucx && \ - rm -rf /opt/hpcx/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout v1.19.x && \ - ./autogen.sh && ./configure \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs=yes \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --with-efa \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig; - -RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ - cd /usr/local/src; \ - pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b 0.8.0 && \ - cd nixl && \ - rm -rf build && \ - mkdir build && \ - meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \ - cd build && \ - ninja && \ - ninja install && \ - cd .. && pip install . --no-deps; - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep deleted file mode 100644 index 96461dcc1b..0000000000 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep +++ /dev/null @@ -1,122 +0,0 @@ -ARG CUDA_VERSION=12.8.0 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 - -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM - -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -RUN --mount=type=cache,target=/root/.cache/pip pip install vllm==0.11.0 --pre --extra-index-url https://wheels.vllm.ai/nightly - -COPY ./requirements.txt /lightllm/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 - -RUN --mount=type=cache,target=/root/.cache/pip pip install --no-deps -v 'git+https://github.com/ModelTC/LightKernel.git@07f2f62af5deb41f10a22660f9f42dba9273361e#egg=lightllm_kernel' - -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential 
dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - - # Fix DeepEP IBGDA symlink -RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ - && cd nvshmem \ - && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cmake --build build --target install -j64 - -ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58 -RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. 
- -WORKDIR /root/DeepEP -ENV NVSHMEM_DIR=/root/nvshmem/install -RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install - -RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \ - DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \ - rm -rf /usr/lib/ucx && \ - rm -rf /opt/hpcx/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout v1.19.x && \ - ./autogen.sh && ./configure \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs=yes \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --with-efa \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig; - -RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ - cd /usr/local/src; \ - pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b 0.8.0 && \ - cd nixl && \ - rm -rf build && \ - mkdir build && \ - meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \ - cd build && \ - ninja && \ - ninja install && \ - cd .. && pip install . --no-deps; - -COPY . 
/lightllm -RUN pip install -e /lightllm --no-cache-dir diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache deleted file mode 100644 index 2ff2dc3616..0000000000 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache +++ /dev/null @@ -1,124 +0,0 @@ -ARG CUDA_VERSION=12.8.0 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 - -ARG PYTHON_VERSION=3.10 -ARG MAMBA_VERSION=24.7.1-0 -ARG TARGETPLATFORM - -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -RUN chmod 777 -R /tmp && apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - libssl-dev \ - curl \ - g++ \ - make \ - git && \ - rm -rf /var/lib/apt/lists/* - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" && \ - /opt/conda/bin/conda install -y boost ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - - -WORKDIR /root - -RUN --mount=type=cache,target=/root/.cache/pip pip install vllm==0.11.0 --pre --extra-index-url https://wheels.vllm.ai/nightly - -COPY ./requirements.txt /lightllm/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 - -RUN --mount=type=cache,target=/root/.cache/pip pip install --no-deps -v 'git+https://github.com/ModelTC/LightKernel.git@07f2f62af5deb41f10a22660f9f42dba9273361e#egg=lightllm_kernel' -RUN 
--mount=type=cache,target=/root/.cache/pip pip install --no-deps -v 'git+https://github.com/ModelTC/LightMem.git@5900baf92d85ef4dbda6124093506b0af906011a#egg=light_mem' - -RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms -RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev - -ENV CUDA_HOME=/usr/local/cuda \ - GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -RUN mkdir -p /tmp/gdrcopy && cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ - && cd gdrcopy/packages \ - && CUDA=/usr/local/cuda ./build-deb-packages.sh \ - && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \ - && cd / && rm -rf /tmp/gdrcopy - - # Fix DeepEP IBGDA symlink -RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ - && cd nvshmem \ - && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cmake --build build --target install -j64 - -ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58 -RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. 
- -WORKDIR /root/DeepEP -ENV NVSHMEM_DIR=/root/nvshmem/install -RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install - -RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \ - DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \ - rm -rf /usr/lib/ucx && \ - rm -rf /opt/hpcx/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout v1.19.x && \ - ./autogen.sh && ./configure \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs=yes \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --with-efa \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig; - -RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ - cd /usr/local/src; \ - pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b 0.8.0 && \ - cd nixl && \ - rm -rf build && \ - mkdir build && \ - meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \ - cd build && \ - ninja && \ - ninja install && \ - cd .. && pip install . --no-deps; - -COPY . /lightllm -RUN pip install -e /lightllm --no-cache-dir
diff --git a/docker/scripts/build.sh b/docker/scripts/build.sh
new file mode 100755
index 0000000000..1699b39dd7
--- /dev/null
+++ b/docker/scripts/build.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Build the LightLLM image from docker/Dockerfile.
+#
+# Notes:
+# - All toggles can be configured via CLI flags or environment variables.
+# - Default behavior matches the old build_default.sh: enable both DEEPEP and NIXL, and enable cache.
+#
+# Examples:
+#   ./docker/scripts/build.sh
+#   ./docker/scripts/build.sh --lite
+#   ./docker/scripts/build.sh --no-deepep --no-cache
+#   ./docker/scripts/build.sh --no-nixl
+#   ./docker/scripts/build.sh --cuda-version 12.4.1 --image-prefix myrepo/lightllm
+#   IMAGE_TAG=custom-cuda12 ./docker/scripts/build.sh
+#
+# Options:
+#   --no-deepep            Disable DEEPEP (default: enabled)
+#   --no-nixl              Disable NIXL (default: enabled)
+#   --no-cache             Disable cache (default: enabled)
+#   --lite                 Disable DEEPEP, NIXL and cache in one shot
+#   --cuda-version VER     CUDA version (default: 12.8.0)
+#   --image-prefix NAME    Image prefix (default: lightllm)
+#   --image-tag TAG        Image tag (default: generated from enabled features)
+#   -h / --help            Show help
+#
+# Generated tags (when neither --image-tag nor IMAGE_TAG is given):
+#   all features on   -> cuda<VER>
+#   some features on  -> <feature>[.<feature>...]-cuda<VER>   (nixl, deepep, cache)
+#   no feature on     -> lite-cuda<VER>
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+cd "${ROOT_DIR}"
+
+IMAGE_PREFIX="${IMAGE_PREFIX:-lightllm}"
+CUDA_VERSION="${CUDA_VERSION:-12.8.0}"
+IMAGE_TAG="${IMAGE_TAG:-}"
+
+ENABLE_DEEPEP="${ENABLE_DEEPEP:-1}"
+ENABLE_NIXL="${ENABLE_NIXL:-1}"
+ENABLE_CACHE="${ENABLE_CACHE:-1}"
+
+# Print only the header comment block (the first run of comment lines after
+# the shebang) with the leading "# " stripped; stop before any code.
+print_help() {
+  awk 'NR > 1 && /^#/ { sub(/^# ?/, ""); print; seen = 1; next } seen { exit }' "$0"
+}
+
+# Exit with a usage error unless option $1 is followed by a non-empty value.
+require_value() {
+  if [[ $# -lt 2 || -z "${2:-}" ]]; then
+    echo "Option $1 requires a value" >&2
+    exit 1
+  fi
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --no-deepep) ENABLE_DEEPEP=0 ;;
+    --no-nixl) ENABLE_NIXL=0 ;;
+    --no-cache) ENABLE_CACHE=0 ;;
+    --lite)
+      ENABLE_DEEPEP=0
+      ENABLE_NIXL=0
+      ENABLE_CACHE=0
+      ;;
+    --cuda-version)
+      require_value "$@"
+      CUDA_VERSION="$2"
+      shift
+      ;;
+    --image-prefix)
+      require_value "$@"
+      IMAGE_PREFIX="$2"
+      shift
+      ;;
+    --image-tag)
+      require_value "$@"
+      IMAGE_TAG="$2"
+      shift
+      ;;
+    -h|--help)
+      print_help
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      print_help >&2
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+# Toggles may also arrive via the environment; reject anything but 0/1 up
+# front instead of failing later with an obscure arithmetic error.
+for toggle in ENABLE_DEEPEP ENABLE_NIXL ENABLE_CACHE; do
+  case "${!toggle}" in
+    0|1) ;;
+    *)
+      echo "${toggle} must be 0 or 1 (got '${!toggle}')" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Generate default image tag based on enabled features:
+# - All on: cuda${CUDA_VERSION} (same as old build_default.sh)
+# - Other combos: enabled feature names joined with '.', or "lite" when no
+#   feature is enabled, so that every feature set maps to a distinct tag.
+if [[ -z "${IMAGE_TAG}" ]]; then
+  if [[ "${ENABLE_NIXL}${ENABLE_DEEPEP}${ENABLE_CACHE}" == "111" ]]; then
+    IMAGE_TAG="cuda${CUDA_VERSION}"
+  else
+    tag_parts=()
+    if [[ "${ENABLE_NIXL}" == "1" ]]; then
+      tag_parts+=("nixl")
+    fi
+    if [[ "${ENABLE_DEEPEP}" == "1" ]]; then
+      tag_parts+=("deepep")
+    fi
+    if [[ "${ENABLE_CACHE}" == "1" ]]; then
+      tag_parts+=("cache")
+    fi
+    if [[ ${#tag_parts[@]} -eq 0 ]]; then
+      tag_parts=("lite")
+    fi
+    IMAGE_TAG="$(IFS='.'; echo "${tag_parts[*]}")-cuda${CUDA_VERSION}"
+  fi
+fi
+
+DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile \
+  --build-arg CUDA_VERSION="${CUDA_VERSION}" \
+  --build-arg ENABLE_DEEPEP="${ENABLE_DEEPEP}" \
+  --build-arg ENABLE_NIXL="${ENABLE_NIXL}" \
+  --build-arg ENABLE_CACHE="${ENABLE_CACHE}" \
+  -t "${IMAGE_PREFIX}:${IMAGE_TAG}" .
diff --git a/docs/CN/source/getting_started/installation.rst b/docs/CN/source/getting_started/installation.rst index 5fa0e304d2..4a28cc6d1f 100755 --- a/docs/CN/source/getting_started/installation.rst +++ b/docs/CN/source/getting_started/installation.rst @@ -9,7 +9,7 @@ Lightllm 是一个纯python开发的推理框架,其中的算子使用triton ------------ * 操作系统: Linux -* Python: 3.9 +* Python: 3.10 * GPU: 计算能力 7.0 以上 (e.g., V100, T4, RTX20xx, A100, L4, H100, 等等.) .. _build_from_docker: @@ -41,7 +41,7 @@ Lightllm 是一个纯python开发的推理框架,其中的算子使用triton $ # 进入代码仓库的根目录 $ cd /lightllm - $ # 手动构建镜像, docker 目录下有不同功能场景的镜像构建文件,按需构建。 + $ # 手动构建镜像。 $ docker build -t -f ./docker/Dockerfile . $ $ # 运行 @@ -57,8 +57,7 @@ Lightllm 是一个纯python开发的推理框架,其中的算子使用triton $ python tools/quick_launch_docker.py --help .. note:: - 如果你使用多卡,你也许需要提高上面的 –shm_size 的参数设置。如果需要跑DeepSeek模型的EP模式,请使用镜像 - ghcr.io/modeltc/lightllm:main-deepep。 + 如果你使用多卡,你也许需要提高上面的 –shm_size 的参数设置。 .. _build_from_source: @@ -70,14 +69,14 @@ Lightllm 是一个纯python开发的推理框架,其中的算子使用triton ..
code-block:: console $ # (推荐) 创建一个新的 conda 环境 - $ conda create -n lightllm python=3.9 -y + $ conda create -n lightllm python=3.10 -y $ conda activate lightllm $ $ # 下载lightllm的最新源码 $ git clone https://github.com/ModelTC/lightllm.git $ cd lightllm $ - $ # 安装lightllm的依赖 (cuda 12.4) - $ pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu124 + $ # 安装lightllm的依赖 (cuda 12.8) + $ pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 $ $ # 安装lightllm的依赖 (摩尔线程 GPU) @@ -85,22 +84,4 @@ Lightllm 是一个纯python开发的推理框架,其中的算子使用triton $ pip install -r requirements-musa.txt $ $ # 安装lightllm - $ python setup.py install - -.. note:: - - Lightllm 的代码在多种GPU上都进行了测试,包括 V100, A100, A800, 4090, 和 H800。 - 如果你使用 A100 、A800 等显卡,那么推荐你安装 triton==3.0.0 : - - .. code-block:: console - - $ pip install triton==3.0.0 --no-deps - - 如果你使用 H800、V100 等显卡,那么推荐你安装 triton-nightly: - - .. code-block:: console - - $ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly --no-deps - - 具体原因可以参考:`issue `_ 和 `fix PR `_ - + $ python setup.py install \ No newline at end of file diff --git a/docs/EN/source/getting_started/installation.rst b/docs/EN/source/getting_started/installation.rst index 6439c48de3..e008ac14c9 100755 --- a/docs/EN/source/getting_started/installation.rst +++ b/docs/EN/source/getting_started/installation.rst @@ -9,7 +9,7 @@ Environment Requirements ------------------------ * Operating System: Linux -* Python: 3.9 +* Python: 3.10 * GPU: Compute Capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) .. _build_from_docker: @@ -61,10 +61,7 @@ Or you can directly use the script to launch the image and run it with one click $ python tools/quick_launch_docker.py --help .. note:: - If you use multiple GPUs, you may need to increase the --shm-size parameter setting above. If you need to run DeepSeek models in EP mode, please use the image - ghcr.io/modeltc/lightllm:main-deepep. - -..
_build_from_source: + If you use multiple GPUs, you may need to increase the --shm-size parameter setting above. Installation from Source ------------------------ @@ -74,14 +71,14 @@ You can also install Lightllm from source: .. code-block:: console $ # (Recommended) Create a new conda environment - $ conda create -n lightllm python=3.9 -y + $ conda create -n lightllm python=3.10 -y $ conda activate lightllm $ $ # Download the latest Lightllm source code $ git clone https://github.com/ModelTC/lightllm.git $ cd lightllm $ - $ # Install Lightllm dependencies (cuda 12.4) - $ pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu124 + $ # Install Lightllm dependencies (cuda 12.8) + $ pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 $ $ # Install Lightllm dependencies (Moore Threads GPU) @@ -89,21 +86,4 @@ You can also install Lightllm from source: $ pip install -r requirements-musa.txt $ $ # Install Lightllm - $ python setup.py install - -.. note:: - - Lightllm code has been tested on various GPUs including V100, A100, A800, 4090, and H800. - If you use A100, A800 and other graphics cards, it is recommended to install triton==3.0.0: - - .. code-block:: console - - $ pip install triton==3.0.0 --no-deps - - If you use H800, V100 and other graphics cards, it is recommended to install triton-nightly: - - .. code-block:: console - - $ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly --no-deps - - For specific reasons, please refer to: `issue `_ and `fix PR `_ \ No newline at end of file + $ python setup.py install \ No newline at end of file