diff --git a/Dockerfile b/Dockerfile
index 7d1539ad2db3..bc3fcaea0b0c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,19 +44,27 @@ RUN apt-get update && \
 
 WORKDIR /workspace/
 
-WORKDIR /tmp/
+# Install megatron core, this can be removed once 0.3 pip package is released
+# We leave it here in case we need to work off of a specific commit in main
+RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
+  cd Megatron-LM && \
+  git checkout 375395c187ff64b8d56a1cd40572bc779864b1bd && \
+  pip install .
 
 # Distributed Adam support for multiple dtypes
 RUN git clone https://github.com/NVIDIA/apex.git && \
   cd apex && \
   git checkout 52e18c894223800cb611682dce27d88050edf1de && \
-  pip3 install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
+  pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
 
-# install megatron core, this can be removed once 0.3 pip package is released
-RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
-  cd Megatron-LM && \
-  git checkout ab0336a5c8eab77aa74ae604ba1e73decbf6d560 && \
-  pip install -e .
+RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
+  cd TransformerEngine && \
+  git fetch origin a03f8bc9ae004e69aae4902fdd4a6d81fd95bc89 && \
+  git checkout FETCH_HEAD && \
+  git submodule init && git submodule update && \
+  NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
+
+WORKDIR /tmp/
 
 # uninstall stuff from base container
 RUN pip3 uninstall -y sacrebleu torchtext
diff --git a/README.rst b/README.rst
index 210ee8929292..5c46d4528e39 100644
--- a/README.rst
+++ b/README.rst
@@ -248,7 +248,7 @@ To install Apex, run
     git clone https://github.com/NVIDIA/apex.git
     cd apex
     git checkout 52e18c894223800cb611682dce27d88050edf1de
-    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
+    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
 
 It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies.
 