Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix #9136

Closed
wants to merge 3 commits into from
Closed

fix #9136

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
140 changes: 44 additions & 96 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
# limitations under the License.
name: "CICD NeMo"

on:
pull_request:
branches: [ "main" ]
types: [ labeled ]
on: pull_request
# :
# branches: [ "main" ]
# types: [ labeled ]

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand All @@ -25,7 +25,7 @@ concurrency:
jobs:
gpu-test:
runs-on: self-hosted-azure
if: ${{ github.event.label.name == 'Run CICD' }}
# if: ${{ github.event.label.name == 'Run CICD' }}
steps:
- name: Run nvidia-smi test
run: |
Expand All @@ -34,7 +34,7 @@ jobs:

cicd-cluster-clean:
runs-on: self-hosted-azure-builder
if: ${{ github.event.label.name == 'Run CICD' }}
# if: ${{ github.event.label.name == 'Run CICD' }}
steps:
- name: Clean server from old files
run: |
Expand All @@ -57,108 +57,56 @@ jobs:
cicd-test-container-setup:
needs: [cicd-cluster-clean]
runs-on: self-hosted-azure-builder
if: ${{ github.event.label.name == 'Run CICD' }}
# uses: actions/cache@v2
#container:
# image: nvcr.io/nvidia/pytorch:24.02-py3
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# if: ${{ github.event.label.name == 'Run CICD' }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
driver: docker

- name: Container setup
- name: Build and push
uses: docker/build-push-action@v5
with:
file: Dockerfile.ci
push: true
# cache-from: nemoci.azurecr.io/nemo_container:latest
tags: |
nemoci.azurecr.io/nemo_container_${{ github.run_id }}
nemoci.azurecr.io/nemo_container:latest

- name: Run some checks
run: |
# Pull base PyTorch container
docker pull nvcr.io/nvidia/pytorch:24.02-py3
docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.02-py3 /bin/bash -c '
set -x

# PyTorch version
python -c "import torch; print(torch.__version__)"
python -c "import torchvision; print(torchvision.__version__)"

# Install test requirements
apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt && pip install -r requirements/requirements_lightning.txt

# Code formatting checks
python setup.py style

# Copyright Headers check
python tests/check_copyright_header.py --dir .

# NeMo Installation
./reinstall.sh release

# Transformer Engine installation
git clone https://github.com/NVIDIA/TransformerEngine.git && \
pushd TransformerEngine && \
git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . && \
popd

# Apex installation
git clone https://github.com/NVIDIA/apex.git && \
pushd apex && \
git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \
cp -R apex /usr/local/lib/python3.10/dist-packages && \
popd

# pip package should be working with main, if not we can update the commit here
# until the pip package is updated
# Megatron Core installation
git clone https://github.com/NVIDIA/Megatron-LM.git && \
pushd Megatron-LM && \
git checkout c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9 && \
pip install . && \
pushd megatron/core/datasets && \
make && \
popd && \
popd
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

# Install only for test: L2: Segmentation Tool
pushd tools/ctc_segmentation && \
pip install -r requirements.txt && \
apt-get update && apt-get install libsox-fmt-all -y && \
popd

# AMMO installation
pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir

# PyTorch Lightning version
python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"

# PyTorch Lightning DDP Checks
CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"

# Basic Import Checks
python -c "import nemo.collections.asr as nemo_asr"
python -c "import nemo.collections.nlp as nemo_nlp"
python -c "import nemo.collections.tts as nemo_tts"

# set permission
chmod 777 -R /workspace
'
### \'\'
docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\
# PyTorch Lightning version
python -c "import pytorch_lightning; print(pytorch_lightning.__version__)" && \

# PyTorch Lightning DDP Checks
CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py" && \

# Basic Import Checks
python -c "import nemo.collections.asr as nemo_asr" && \
python -c "import nemo.collections.nlp as nemo_nlp" && \
python -c "import nemo.collections.tts as nemo_tts"'
# && \

#python setup.py style && \
#python tests/check_copyright_header.py --dir .
'

- name: Push container to registry for future use
run: |
# Push container
echo "Docker: List containers" && docker ps -a
DOCKER_COMMIT=$(docker ps --latest --quiet) # latest container
docker commit $DOCKER_COMMIT nemoci.azurecr.io/nemo_container_${{ github.run_id }}
docker tag nemoci.azurecr.io/nemo_container_${{ github.run_id }} nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# echo "Docker: List containers" && docker ps -a
# DOCKER_COMMIT=$(docker ps --latest --quiet) # latest container
# docker commit $DOCKER_COMMIT nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# docker tag nemoci.azurecr.io/nemo_container_${{ github.run_id }} nemoci.azurecr.io/nemo_container_${{ github.run_id }}
docker push nemoci.azurecr.io/nemo_container_${{ github.run_id }}
docker push nemoci.azurecr.io/nemo_container

# - name: Build and push to local registry
# uses: docker/build-push-action@v5
Expand Down
76 changes: 76 additions & 0 deletions Dockerfile.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# syntax=docker/dockerfile:1-labs

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Base image for the CI container; override with --build-arg BASE_IMAGE=<image>.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3

FROM ${BASE_IMAGE}

# Environment baked into the image so every container started from it gets it.
ENV TRANSFORMERS_OFFLINE=0
ENV HYDRA_FULL_ERROR=1
ENV PYTHONUNBUFFERED=1

# APT packages
# The package index is removed in the same RUN so it is not stored in the layer;
# run `apt-get update` again before installing further packages in a container.
RUN <<"EOF" bash -ex
apt-get update
apt-get install -y bc libsox-fmt-all
apt-get clean
rm -rf /var/lib/apt/lists/*
EOF

WORKDIR /workspace

# Install NeMo requirements
# Pinned git refs / versions for the dependencies that are not taken from the
# requirements/ files; each can be overridden with --build-arg.
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG AMMO_VERSION=0.9.0
ARG MCORE_TAG=c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
# requirements/ and tools/ are bind-mounted from the build context for this
# step, so the requirement files are read here before the source tree is copied.
# The outer heredoc delimiter is quoted ("EOF"), so the script body is handed to
# bash as written and the ${...} references below are expanded by bash.
RUN --mount=type=bind,source=requirements,target=requirements --mount=type=bind,source=tools,target=tools <<"EOF" bash -ex
pip install --no-cache-dir pip-tools

# Write the pinned extra dependencies to a requirements file. The inner
# delimiter (EOL) is unquoted, so the tag/version variables are substituted
# when the file is written; tee also echoes the result into the build log.
tee requirements_existing.txt <<EOL
--extra-index-url https://pypi.nvidia.com
transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}
megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}
nvidia-ammo~=${AMMO_VERSION}
apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}
EOL

# Resolve the pinned extras together with all listed NeMo requirement files
# into a single lock file (requirements.lock).
pip-compile -o requirements.lock --pip-args "--no-build-isolation" \
requirements_existing.txt \
requirements/requirements.txt \
requirements/requirements_test.txt \
requirements/requirements_lightning.txt \
requirements/requirements_common.txt \
requirements/requirements_asr.txt \
requirements/requirements_nlp.txt \
requirements/requirements_tts.txt \
requirements/requirements_slu.txt \
requirements/requirements_multimodal.txt \
tools/ctc_segmentation/requirements.txt

# Install exactly the set of packages resolved into the lock file.
pip install --no-cache-dir --no-build-isolation -r requirements.lock
EOF

# Copy over NeMo code
COPY ./ ./
# Install NeMo with all extras from the copied source tree.
# The requirement is quoted so bash never treats `.[all]` as a glob pattern
# (it would match a file named `.a` or `.l`), and the pip cache is disabled so
# downloaded wheels are not stored in the image layer.
RUN <<"EOF" bash -ex
pip install --no-cache-dir ".[all]"
EOF

# set permission: make everything under /workspace accessible to any user
RUN chmod 777 -R /workspace

# NOTE(review): nothing in this Dockerfile creates /workspace/Megatron-LM
# (megatron_core is pip-installed from git in the requirements step) —
# confirm whether this PYTHONPATH entry is still needed.
ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

2 changes: 1 addition & 1 deletion reinstall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ fi

echo 'Installing nemo'
if [[ "$INSTALL_OPTION" == "dev" ]]; then
${PIP} install --editable ".[all]"
${PIP} install --no-cache-dir --editable ".[all]"
else
rm -rf dist/
${PIP} install build pytest-runner
Expand Down