From af21b7765882d9a694f2c0c5e1cb7d87d98d733e Mon Sep 17 00:00:00 2001
From: Daniel Galvez
Date: Thu, 7 Mar 2024 12:45:48 -0800
Subject: [PATCH 001/140] Accelerate CTC greedy decoding by around 10% (#8521)

* Accelerate CTC greedy decoding by over 10x.

There were a few problems with CTC greedy decoding before:

- It would copy GPU memory to pageable memory originally, which is very slow compared to copying to pinned memory. Unfortunately, cudaMallocHost() is synchronous and "slow", but fortunately pytorch has a free list of recent pinned memory allocations in its pinned memory allocator, allowing us to reduce the number of these calls.
- A single scalar of logits_len/decoder_lengths was copied from GPU to CPU at a time, rather than all at once in a single call. This caused needless overhead. We could also use a pinned memory allocation for copying logits_len as well, but using pinned memory is less important for small allocations.
- detach() was called in a function marked with @nograd.
- For some reason, someone was using torch_tensor.numpy().tolist() instead of torch_tensor.tolist(). I don't believe numpy() makes a copy of the data, but it is unnecessary.

The important improvements are the first two bullet points; everything else is minor. There are more opportunities for improvement. In particular, logits gets copied to cpu twice if trcfg.return_hypotheses is True.

Performance improvements: I ran this code on an A100 GPU, on a machine with 16 CPU cores:

```
python examples/asr/speech_to_text_eval.py \
 pretrained_name=nvidia/parakeet-ctc-1.1b \
 dataset_manifest=/home/dgalvez/scratch/data/test_other.json \
 batch_size=16 output_filename=test_other_decoded.jsonl amp=true \
 amp_dtype=bfloat16 use_cer=false num_workers=1
```

Time for each of 5 evaluations of LibriSpeech test-other before my changes: 33, 29, 29, 30, 28 seconds. Average: 29.8 seconds; average excluding the first (warmup) run: 29 seconds.

Time for each of 5 evaluations of LibriSpeech test-other after my changes: 35, 28, 29, 26, 26 seconds. Average: 28.8 seconds; average excluding the first (warmup) run: 27.25 seconds.

This corresponds to an almost 10% end-to-end speedup, which meets expectations, since about 10% of the time was originally spent on CTC greedy decoding. You may wonder why the first iteration is slower: calling cudaMallocHost() and then doing a pinned-memory copy from GPU to CPU is slower than doing a pageable-memory copy from GPU to CPU. However, the cudaMallocHost() allocations are cached over time, allowing us to avoid their overhead in later evaluations of the dataset.

Signed-off-by: Daniel Galvez

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Make this work in CPU-only mode.
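In isolation, the pinned-memory idiom this patch introduces looks like the following minimal sketch; `logits` and `to_cpu_pinned` are illustrative names, not the exact NeMo code path:

```python
# Minimal, standalone sketch of the pinned-memory device-to-host copy used in
# this patch. `logits` is an illustrative name; any CUDA tensor works.
import torch

def to_cpu_pinned(logits: torch.Tensor) -> torch.Tensor:
    if logits.is_cuda:
        # pin_memory=True allocates page-locked host memory via cudaMallocHost();
        # PyTorch caches these allocations, so repeated calls are cheap after warmup.
        logits_cpu = torch.empty(
            logits.shape, dtype=logits.dtype, device=torch.device("cpu"), pin_memory=True
        )
        # Copying into pinned memory lets the transfer be issued asynchronously
        # instead of going through the slow pageable-memory path.
        logits_cpu.copy_(logits, non_blocking=True)
    else:
        logits_cpu = logits
    return logits_cpu
```

The same pattern appears in both ctc_models.py and ctc_greedy_decoding.py in the diff below.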
Signed-off-by: Daniel Galvez * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Daniel Galvez Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/asr/models/ctc_models.py | 16 +++++++--- .../parts/submodules/ctc_greedy_decoding.py | 32 +++++++++++++++---- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 42406415651c..5f380619db68 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -662,14 +662,22 @@ def _transcribe_output_processing(self, outputs, trcfg: TranscribeConfig) -> Gen current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( logits, decoder_lengths=logits_len, return_hypotheses=trcfg.return_hypotheses, ) - logits = logits.cpu() - if trcfg.return_hypotheses: + if logits.is_cuda: + # See comment in + # ctc_greedy_decoding.py::GreedyCTCInfer::forward() to + # understand this idiom. + logits_cpu = torch.empty(logits.shape, dtype=logits.dtype, device=torch.device("cpu"), pin_memory=True) + logits_cpu.copy_(logits, non_blocking=True) + else: + logits_cpu = logits + logits_len = logits_len.cpu() # dump log probs per file - for idx in range(logits.shape[0]): - current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] + for idx in range(logits_cpu.shape[0]): + current_hypotheses[idx].y_sequence = logits_cpu[idx][: logits_len[idx]] if current_hypotheses[idx].alignments is None: current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence + del logits_cpu # cleanup memory del logits, logits_len diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index 686ef79cabad..ab4b4c40e860 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -161,7 +161,25 @@ def forward( with torch.inference_mode(): hypotheses = [] # Process each sequence independently - prediction_cpu_tensor = decoder_output.cpu() + + if decoder_output.is_cuda: + # This two-liner is around twenty times faster than: + # `prediction_cpu_tensor = decoder_output.cpu()` + # cpu() does not use pinned memory, meaning that a slow pageable + # copy must be done instead. + prediction_cpu_tensor = torch.empty( + decoder_output.shape, dtype=decoder_output.dtype, device=torch.device("cpu"), pin_memory=True + ) + prediction_cpu_tensor.copy_(decoder_output, non_blocking=True) + else: + prediction_cpu_tensor = decoder_output + + if decoder_lengths is not None and isinstance(decoder_lengths, torch.Tensor): + # Before this change, self._greedy_decode_labels would copy + # each scalar from GPU to CPU one at a time, in the line: + # prediction = prediction[:out_len] + # Doing one GPU to CPU copy ahead of time amortizes that overhead. 
+ decoder_lengths = decoder_lengths.cpu() if prediction_cpu_tensor.ndim < 2 or prediction_cpu_tensor.ndim > 3: raise ValueError( @@ -192,7 +210,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): # Initialize blank state and empty label set in Hypothesis hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - prediction = x.detach().cpu() + prediction = x.cpu() if out_len is not None: prediction = prediction[:out_len] @@ -200,7 +218,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): prediction_logprobs, prediction_labels = prediction.max(dim=-1) non_blank_ids = prediction_labels != self.blank_id - hypothesis.y_sequence = prediction_labels.numpy().tolist() + hypothesis.y_sequence = prediction_labels.tolist() hypothesis.score = (prediction_logprobs[non_blank_ids]).sum() if self.preserve_alignments: @@ -208,7 +226,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): hypothesis.alignments = (prediction.clone(), prediction_labels.clone()) if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].numpy().tolist() + hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() if self.preserve_frame_confidence: hypothesis.frame_confidence = self._get_confidence(prediction) @@ -222,20 +240,20 @@ def _greedy_decode_labels(self, x: torch.Tensor, out_len: torch.Tensor): # Initialize blank state and empty label set in Hypothesis hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - prediction_labels = x.detach().cpu() + prediction_labels = x.cpu() if out_len is not None: prediction_labels = prediction_labels[:out_len] non_blank_ids = prediction_labels != self.blank_id - hypothesis.y_sequence = prediction_labels.numpy().tolist() + hypothesis.y_sequence = prediction_labels.tolist() hypothesis.score = -1.0 if self.preserve_alignments: raise ValueError("Requested for alignments, but predictions provided were labels, not log probabilities.") if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].numpy().tolist() + hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() if self.preserve_frame_confidence: raise ValueError( From aad6cf7789c8b54c4e6cc22bd1ff9141e566a61e Mon Sep 17 00:00:00 2001 From: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Date: Thu, 7 Mar 2024 15:42:57 -0800 Subject: [PATCH 002/140] bug fix in transcribe_speech.py (#8611) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada --- examples/asr/transcribe_speech.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index b8a10f01603f..6d6006e939e5 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -396,8 +396,6 @@ def autocast(dtype=None): logging.info(f"Finished transcribing from manifest file: {cfg.dataset_manifest}") if cfg.presort_manifest: transcriptions = restore_transcription_order(cfg.dataset_manifest, transcriptions) - if remove_path_after_done is not None: - os.unlink(remove_path_after_done) else: logging.info(f"Finished transcribing {len(filepaths)} files !") logging.info(f"Writing transcriptions into file: {cfg.output_filename}") @@ -420,6 +418,11 @@ def autocast(dtype=None): ) logging.info(f"Finished writing predictions to {output_filename}!") 
+ # clean-up + if cfg.presort_manifest is not None: + if remove_path_after_done is not None: + os.unlink(remove_path_after_done) + if cfg.calculate_wer: output_manifest_w_wer, total_res, _ = cal_write_wer( pred_manifest=output_filename, From 8f3855f241099a83b405d2057998d628789ec73b Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Thu, 7 Mar 2024 16:34:42 -0800 Subject: [PATCH 003/140] Remove irrelevant multimodal models in docs (#8574) * Remove irrelevant models in docs Signed-off-by: yaoyu-33 * Fix citation Signed-off-by: yaoyu-33 * clean up mm intro page Signed-off-by: yaoyu-33 * Update docs/source/multimodal/nerf/intro.rst Signed-off-by: Eric Harper --------- Signed-off-by: yaoyu-33 Signed-off-by: Eric Harper Co-authored-by: Eric Harper --- docs/source/multimodal/mllm/intro.rst | 100 +--------------------- docs/source/multimodal/nerf/intro.rst | 47 +--------- docs/source/multimodal/text2img/intro.rst | 86 +------------------ docs/source/multimodal/vlm/intro.rst | 72 +--------------- 4 files changed, 4 insertions(+), 301 deletions(-) diff --git a/docs/source/multimodal/mllm/intro.rst b/docs/source/multimodal/mllm/intro.rst index 4a87ac44e7c5..687ecd930a9e 100644 --- a/docs/source/multimodal/mllm/intro.rst +++ b/docs/source/multimodal/mllm/intro.rst @@ -1,97 +1,7 @@ Multimodal Language Models ========================== -The endeavor to extend Language Models (LLMs) into multimodal domains by integrating additional structures like visual encoders has become a focal point of recent research, especially given its potential to significantly lower the cost compared to training multimodal universal models from scratch. - -The advent of GPT-4 has spurred a plethora of developments including notable models like LLaVA, Mini-GPT4, and Flamingo. These models, despite minor differences, share similar structural and training strategies. - -Supported Models ------------------ -NeMo Multimodal currently supports the following models: - -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ -| Model | Training | Fine-Tuning | PEFT | Evaluation | Inference | -+===================================+==========+=============+======+=========================+==================+ -| `NeVA (LLaVA) <./neva.html>`_ | Yes | Yes | - | - | Yes | -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ -| Kosmos-2 | WIP | WIP | - | - | WIP | -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ - -Spotlight Models ------------------ - -LLaVA: Visual Instruction Tuning -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -LLaVA :cite:`mm-models-llava` focuses on creating a dataset for visual instruction tuning to enhance LLMs' ability to comprehend diverse instructions and provide detailed responses. NeMo's implementation of LLaVA is called NeVA. - -- Model Structure: - - Visual Encoder: Utilizes CLIP’s ViT-L/14. - - Text Decoder: Employs LLaMA. - - Connection: A simple linear mapping layer connects the visual encoder's output to the text decoder's word embedding space (v1.0 version). - -- Training: - 1. Cross-modal Pre-training: Utilizes 595k image-text data from CC3M, training only the linear mapping layer while keeping the visual encoder and text decoder frozen. - 2. 
Instruction Fine-tuning: Custom-built 158k multimodal instruction dataset employed for fine-tuning targeting multimodal chatbot scenarios, with a variant targeting the Science QA dataset. - -Flamingo: A Visual Language Model for Few-Shot Learning -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Flamingo :cite:`mm-models-flamingo` addresses inconsistent visual feature map sizes by generating fixed-length feature sequences, enhancing visual relevance generation. - -- Model Structure: - - Resampler: Utilizes a Perceiver Resampler for generating fixed-length feature sequences. - - Attention: Adds cross-attention layers before each LLM layer to enhance visual relevance generation. - -- Training: - - Dataset: Utilizes data from various datasets like M3W, ALIGN, LTIP, and VTP emphasizing multimodal in-context learning. - -Kosmos-1: Language Is Not All You Need: Aligning Perception with Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Kosmos-1 :cite:`mm-models-kosmos1` by Microsoft is a Multimodal Large Language Model (MLLM) aimed at melding language, perception, action, and world modeling. - -- Model Structure: - - Core Backbone: Transformer-Based Causal Language Model. - - Architecture: Utilizes MAGNETO, a nuanced Transformer variant. - - Position Encoding: Employs XPOS relative position encoding for long-context modeling. - - Resampler: Employs Flamingo's Perceiver Resampler - -- Training: - - Dataset: Encompasses web-scale multimodal corpora including monomodal, cross-modal paired, and interleaved multimodal data. - - Objective: Focused on next-token prediction to maximize log-likelihood of tokens within examples. - -BLIP-2: Bootstrapping Language-Image Pre-training -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -BLIP-2 :cite:`mm-models-blip2` adopts a two-phase training strategy focusing on learning key visual information and adapting visual encoding structure to LLMs. - -- Model Structure: - - Visual Encoder: Combines a pre-trained image encoder with a Querying Transformer (Q-Former). - - Bridging: The Q-Former acts as the bridge between the image encoder and the Large Language Model (LLM). - -- Training: - 1. Phase 1: Focuses on tasks like Image-Text Contrastive Learning, Image-grounded Text Generation, and Image-Text Matching. - 2. Phase 2: Aims at adapting the visual encoding structure's output to LLMs with language modeling as the training task. - -Mini-GPT4: Enhancing Vision-Language Understanding -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Mini-GPT4 :cite:`mm-models-minigpt4` emphasizes the importance of multimodal instruction data for model performance in multimodal open-ended scenarios. - -- Model Structure: - - Visual Encoder: Employs BLIP2’s ViT and Q-Former. - - Text Decoder: Uses Vicuna (a fine-tuned version of LLaMA). - - Connection: A linear mapping layer projects visual features into text representation space. - -- Training: - 1. Cross-modal Learning: Focuses on learning the relationship between vision and language using data from CC+SBU+LAION datasets. - 2. Fine-tuning: Utilizes a multimodal fine-tuning dataset built using ChatGPT to enhance text descriptions generated in phase 1. - -.. note:: - NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . 
- -For more information, see additional sections in the NeMo multimodal language model docs on the left-hand-side menu or in the list below: +The endeavor to extend Language Models (LLMs) into multimodal domains by integrating additional structures like visual encoders has become a focal point of recent research, especially given its potential to significantly lower the cost compared to training multimodal universal models from scratch. Please refer to `NeMo Framework User Guide for Multimodal Models `_ for detailed support information. .. toctree:: :maxdepth: 1 @@ -101,11 +11,3 @@ For more information, see additional sections in the NeMo multimodal language mo checkpoint neva -References ----------- - -.. bibliography:: ../mm_all.bib - :style: plain - :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- diff --git a/docs/source/multimodal/nerf/intro.rst b/docs/source/multimodal/nerf/intro.rst index eca057215a75..1380fe65a54d 100644 --- a/docs/source/multimodal/nerf/intro.rst +++ b/docs/source/multimodal/nerf/intro.rst @@ -1,42 +1,6 @@ NeRF ==== -NeMO NeRF is a collection of models and tools for training 3D and 4D models. - -The library is designed with a modular approach, enabling developers to explore and find the most suitable solutions for their requirements, -and allowing researchers to accelerate their experimentation process. - - -Supported Models ------------------ -NeMo NeRF currently supports the following models: - -+----------------------------------------+------------+ -| Model | Categories | -+========================================+============+ -| `DreamFusion <./dreamfusion.html>`_ | text to 3D | -+----------------------------------------+------------+ - - -Spotlight Models ------------------ - -DreamFusion -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The `DreamFusion `_ model utilizing pre-trained 2D text-to-image diffusion models to create detailed 3D objects from textual descriptions. -This approach overcomes the limitations of traditional 3D synthesis, which typically requires extensive labeled 3D data and sophisticated denoising architectures. -At the core of DreamFusion is the optimization of a Neural Radiance Field (NeRF), a parametric model for rendering 3D scenes. -The optimization process is driven by a loss function based on probability density distillation, which enables the 2D diffusion model to act as an effective prior. -DreamFusion is capable of producing 3D models that are not only accurate representations of the input text but also offer versatility in terms of rendering from any viewpoint, -relighting under diverse lighting conditions, and integration into various 3D environments. Importantly, this method achieves these results without the need for -specific 3D training data or modifications to the existing image diffusion model. - -- Model Structure: - - Text-to-image model: a pretrained text-to-image diffusion model is used to generate a 2D image from a given text. - - NeRF: a neural radiance field (NeRF) that can generate novel views of complex 3D scenes, based on a partial set of 2D images. - - Renderer: A volume rendering layer is used to render the NeRF model from a given viewpoint. - - -For more information, see additional sections in the NeRF docs on the left-hand-side menu or in the list below: +NeMo NeRF is a collection of models and tools for training 3D and 4D models. Please refer to `NeMo Framework User Guide for Multimodal Models `_ for detailed support information. .. 
toctree:: :maxdepth: 1 @@ -44,12 +8,3 @@ For more information, see additional sections in the NeRF docs on the left-hand- datasets configs dreamfusion - -References ----------- - -.. bibliography:: ../mm_all.bib - :style: plain - :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- diff --git a/docs/source/multimodal/text2img/intro.rst b/docs/source/multimodal/text2img/intro.rst index 39ce33562d50..9ec793d246fa 100644 --- a/docs/source/multimodal/text2img/intro.rst +++ b/docs/source/multimodal/text2img/intro.rst @@ -1,82 +1,7 @@ Text to Image Models ==================== - -Supported Models ------------------ -NeMo Multimodal currently supports the following models: - -+----------------------------------------+------------+ -| Model | Categories | -+========================================+============+ -| `Stable Diffusion <./sd.html>`_ | Foundation | -+----------------------------------------+------------+ -| `Imagen <./imagen.html>`_ | Foundation | -+----------------------------------------+------------+ -| `DreamBooth <./dreambooth.html>`_ | Finetune | -+----------------------------------------+------------+ -| `ControlNet <./controlnet.html>`_ | Finetune | -+----------------------------------------+------------+ -| `instructPix2Pix <./insp2p.html>`_ | Finetune | -+----------------------------------------+------------+ - - -Text2Img Foundation Models --------------------------- -Text-to-image models are a fascinating category of artificial intelligence models that aim to generate realistic images from textual descriptions. The mainstream text-2-image models can be broadly grouped into: - -#. **Diffusion Based Models**: these models leverage diffusion processes to - generate images from text and may operate in the latent space (Stable Diffusion :cite:`mm-models-rombach2022highresolution`) or directly in the pixel space (Imagen :cite:`mm-models-saharia2022photorealistic`). These models typically use probabilistic models to model the generation process. - They consider the sequential diffusion of information, which helps them generate images in a more coherent and controlled manner. - This approach is known for producing high-quality and diverse images while incorporating textual descriptions. - -#. **Autoregressive Based Models**: like Parti :cite:`mm-models-yu2022scaling` - and Make-A-Scene :cite:`mm-models-gafni2022makeascene`, generate images one pixel or region at a time. - These models take in the text description and gradually build the image pixel by pixel or element by element in - an autoregressive manner. While this approach can produce detailed images, it can be computationally expensive - and may not scale well for high-resolution images. - - -#. **Masked Token Prediction Models**: including MUSE :cite:`mm-models-chang2023muse`, employ masked token prediction-based architectures. - These models learn to map text and image inputs into a shared embedding space. - They use a masked token prediction task during pretraining, allowing them to understand the - relationships between text and images. Given a text prompt, they can retrieve or generate images - that align with the content and context of the text description. - - -Each of these approaches has its strengths and weaknesses, making them suitable for different use cases and scenarios. 
-Diffusion-based models excel in generating diverse and high-quality images, autoregressive models offer fine-grained control, -and masked token prediction-based models are strong at understanding and aligning text and images. -The choice of model depends on the specific requirements of the text-to-image generation task at hand. - - -Approaches to Customize/Extend Text2Img Models ----------------------------------------------- - -Customizing and extending Text2Img models can be essential to tailor these foundation models to -specific applications or creative tasks. Some popular approaches to customize and extend text2img models include: - - -#. **Text-Based Image Editing**: such as instructPix2Pix :cite:`mm-models-insp2p`, involves manipulating or modifying generated images based on - textual descriptions. To customize text2img models for this purpose, one can employ post-processing techniques to - alter the generated images. - -#. **Injecting New Concepts**: including DreamBooth :cite:`mm-models-ruiz2023dreambooth`, can introduce new concepts into text2img models. This is typically done by - adapting foundation models with additional data for finetuning. - -#. **Adding Conditionings to Guide Image Generation**: like ControlNet :cite:`mm-models-zhang2023adding`, allows for greater control and specificity in the generated images. - These conditionings can be based on various factors including specific attributes mentioned in the text (such as colors, sizes, or object properties), - spatial information, style and mood. - -Customizing and extending Text2Img models based on these approaches empowers users to have more control over the generated content, -make images more contextually relevant, and adapt the models to a wide array of creative and practical tasks, -from art creation to content personalization. - -.. note:: - NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . - - -For more information, see additional sections in the MM Text2Img docs on the left-hand-side menu or in the list below: +NeMo multimodal provides implementations of multiple image-to-text models, including Stable Diffusion, Imagen, DreamBooth, ControlNet, and InstructPix2Pix. Please refer to `NeMo Framework User Guide for Multimodal Models `_ for detailed support information. .. toctree:: :maxdepth: 1 @@ -88,12 +13,3 @@ For more information, see additional sections in the MM Text2Img docs on the lef imagen dreambooth controlnet - -References ----------- - -.. bibliography:: ../mm_all.bib - :style: plain - :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/multimodal/vlm/intro.rst b/docs/source/multimodal/vlm/intro.rst index 949fb8a11196..2885b27e24a4 100644 --- a/docs/source/multimodal/vlm/intro.rst +++ b/docs/source/multimodal/vlm/intro.rst @@ -1,68 +1,7 @@ Vision-Language Foundation ========================== -Humans naturally process information using multiple senses like sight and sound. Similarly, multi-modal learning aims to create models that handle different types of data, such as images, text, and audio. There's a growing trend in models that combine vision and language, like OpenAI's CLIP. These models excel in tasks like aligning image and text features, image captioning and visual question-answering. 
Their ability to generalize without specific training offers many practical uses. - -Supported Models ------------------ -NeMo Multimodal currently supports the following models: - -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ -| Model | Training | Fine-Tuning | PEFT | Evaluation | Inference | -+===================================+==========+=============+======+=========================+==================+ -| `CLIP <./clip.html>`_ | ✓ | - | - | zero-shot imagenet | similarity score | -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ - -Spotlight Models ------------------ - -Vision-Language models are at the forefront of multimodal learning, showcasing impressive abilities in tasks that require a combination of visual and textual comprehension. Let's take a quick look at some key models driving progress in this field: - -#. **Contrastive Learning Based Models**: At the forefront is CLIP :cite:`mm-models-radford2021clip`, which harnesses contrastive learning to jointly fine-tune a text and image encoder, facilitating a gamut of downstream tasks. CLIP's success has spurred further research, leading to models like ALIGN :cite:`mm-models-saharia2022photorealistic` and DeCLIP :cite:`mm-models-li2021declip`. - -#. **Holistic Foundation Models**: FLAVA :cite:`mm-models-singh2022flava` aspires to craft a universal model adept at vision, language, and multimodal tasks. Through a unified architecture, it vies to excel across a spectrum of tasks, embodying the essence of a true foundation model. - -#. **Bootstrapping Techniques**: BLIP :cite:`mm-models-blip2` employs a pioneering framework that shines in both understanding-based and generation-based vision-language tasks. By bootstrapping captions from noisy web data, it exhibits remarkable generalization across a plethora of vision-language challenges. - -Anatomy of Vision-Language Models ----------------------------------- - -At their core, vision-language models fundamentally consist of three main parts: - -1. **Image Encoder:** Extracts features from images. -2. **Text Encoder:** Extracts features from textual data. -3. **Fusion Strategy:** Merges the information gleaned from both encoders. - -These models have undergone a significant transformation. Earlier models used manually designed image descriptors and pre-trained word vectors. Nowadays, models primarily utilize transformer architectures for both image and text encoding, learning features together or separately. The pre-training objectives of these models are carefully designed to suit a wide range of tasks. - -Contrastive Learning: Bridging Vision and Language ---------------------------------------------------- - -Contrastive learning has burgeoned as a pivotal pre-training objective, especially for vision-language models. Models like CLIP, CLOOB, ALIGN, and DeCLIP have harnessed contrastive learning to bridge the chasm between vision and language. They accomplish this by jointly learning a text encoder and an image encoder using a contrastive loss, typically on extensive datasets encompassing {image, caption} pairs. - -The quintessence of contrastive learning is to map images and texts to a shared feature realm. Here, the distance between the embeddings of congruent image-text pairs is minimized, while it's maximized for incongruent pairs. 
For instance, CLIP employs the cosine distance between text and image embeddings, while models like ALIGN and DeCLIP have crafted their own distance metrics to cater to the intricacies of their datasets. - -CLIP and Beyond ---------------- - -The CLIP (Contrastive Language-Image Pre-training) model has notably served as a linchpin for various models and applications within the realms of deep learning and computer vision, and also within the NeMo toolkit. Below is an elucidation on how the CLIP model extends its influence into other models and domains: - -1. **Use Cases in Vision Tasks:** - * **Classification:** CLIP can be harnessed for classification tasks, accepting arbitrary text labels for zero-shot classification on video frames or images. - * **Semantic Image Search:** Constructing a semantic image search engine with CLIP showcases its capability to generate embeddings for semantic content analysis and similarity search. - -2. **Image Similarity and Clustering:** - * In a practical scenario, CLIP's embeddings were leveraged for an image similarity search engine, showcasing its effectiveness in generating useful representations for visual similarity scenarios, even without being specifically trained for such tasks. - -3. **Foundation for Multimodal Language Models:** - * Large language models with visual capabilities, such as LLaVA, Flamingo, Kosmos-1, and Kosmos-2, have leaned on CLIP's architecture. In these models, images are encoded using a visual encoder derived from CLIP. - -4. **Foundation Diffusion Models:** - * Models like Stable Diffusion and Imagen have tapped into the prowess of the text encoder from CLIP to condition their processes based on text prompts. This integration exemplifies the adaptability and influence of the CLIP encoder in the broader AI landscape, especially in the domain of diffusion models. - -.. note:: - NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . - +Humans naturally process information using multiple senses like sight and sound. Similarly, multimodal learning aims to create models that handle different data types, such as images, text, and audio. There's a growing trend in models that combine vision and language, like OpenAI's CLIP. These models excel at tasks like aligning image and text features, image captioning, and visual question-answering. Their ability to generalize without specific training offers many practical uses. Please refer to `NeMo Framework User Guide for Multimodal Models `_ for detailed support information. .. toctree:: :maxdepth: 1 @@ -71,12 +10,3 @@ The CLIP (Contrastive Language-Image Pre-training) model has notably served as configs checkpoint clip - -References ----------- - -.. 
bibliography:: ../mm_all.bib - :style: plain - :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- \ No newline at end of file From 60af0825aa5a067353e97a1a7877d78b96fa1400 Mon Sep 17 00:00:00 2001 From: Gerald Shen <119401249+gshennvm@users.noreply.github.com> Date: Thu, 7 Mar 2024 17:20:43 -0800 Subject: [PATCH 004/140] run val only if val dataloader exists (#8605) Signed-off-by: Gerald Shen Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_base_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 15df152b79c2..cd5587351ecd 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -333,8 +333,8 @@ def _reconfigure_val_batches(self): self.trainer.limit_val_batches *= get_num_microbatches() else: assert isinstance(self.trainer.limit_val_batches, float) - # Don't reconfigure if limit_val_batches is 0.0 - if self.trainer.limit_val_batches == 0.0: + # Don't reconfigure if limit_val_batches is 0.0 or if there's no val dataloader + if self.trainer.limit_val_batches == 0.0 or self._validation_dl is None: return # len(self._validation_dl) returns len as num of microbatches val_len_in_micro_batches = len(self._validation_dl) From 593e6621cb89a681890dfdeaa88d25a724ddc5c0 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Fri, 8 Mar 2024 11:26:28 -0800 Subject: [PATCH 005/140] Upgrade to PTL 2.1 and 2.2 (#8030) * Add the following changes for PTL 2.1 1) Remove LightningModuleWrapperBase around model as its not required with PTL 2.1 2) Make precision as None when using precision plugin in MegatronTrainerBuilder 3) Change dataloader_iter API for some megatron model Signed-off-by: Abhishree * Change dataloader_iter API and remove val_iterator_done 1) Change dataloader_iter API according to PTl 2.1 for bert and gpt model 2) Comment self._val_iterator_done for all megatron models Signed-off-by: Abhishree * Override format_checkpoint_nae and fix dataloader_iter API Signed-off-by: Abhishree * Update PTL version in requirements Signed-off-by: Abhishree * Add the following changes for PTL 2.1 1) Remove LightningModuleWrapperBase around model as its not required with PTL 2.1 2) Make precision as None when using precision plugin in MegatronTrainerBuilder 3) Change dataloader_iter API for some megatron model Signed-off-by: Abhishree * Change dataloader_iter API and remove val_iterator_done 1) Change dataloader_iter API according to PTl 2.1 for bert and gpt model 2) Comment self._val_iterator_done for all megatron models Signed-off-by: Abhishree * Override format_checkpoint_nae and fix dataloader_iter API Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused import and comment val_iterator_done Signed-off-by: Abhishree * Override _link_checkpoint Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Temporarily comment out CPU Unit tests Signed-off-by: Abhishree * Remove precision arg from Trainer in convert_hf_llama_to_nemo.py Signed-off-by: Abhishree * Fix dataloader_iter API for megatron_lm_encoder_decoder_model.py 
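For orientation while reading the many dataloader_iter hunks below, a simplified sketch of the recurring change (the function and argument names here are illustrative, not an actual NeMo method): at these call sites, next(dataloader_iter) now yields a (batch, batch_idx, dataloader_idx) triple rather than the batch alone, so the indices are unpacked and discarded.

```python
# Simplified illustration of the dataloader_iter unpacking pattern this patch
# applies across the megatron models; `model` here is any callable, not a
# specific NeMo class.
def fwd_output_and_loss_func(dataloader_iter, model):
    # Previously: batch = next(dataloader_iter)
    batch, _batch_idx, _dataloader_idx = next(dataloader_iter)
    output = model(batch)
    return output
```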
Signed-off-by: Abhishree * Temporarily disable NMT Training TP=2 test Signed-off-by: Abhishree * Fix val_step, test_step func API MegatronLMEncoderDecoderModel Signed-off-by: Abhishree * Enable NMT training TP=2 test Signed-off-by: Abhishree * Disable some unit tests Signed-off-by: Abhishree * Comment CI tests Signed-off-by: Abhishree * Comment resume part of BART Signed-off-by: Abhishree * Uncomment few lines from JenkinsFile Signed-off-by: Abhishree * Return len of dataloader in microbatches Signed-off-by: Abhishree * Fix _link_checkpoint 1) Add inject_model_parallel_rank to _link_checkpoint 2) Override super._link_checkpoint to remove condition check for rank 0 Signed-off-by: Abhishree * Check if using dist ckpt in _link_checkpoint Signed-off-by: Abhishree * Remove batch_idx arg from validation_step megatron_gpt_sft_model.py Signed-off-by: Abhishree * Use PTL bug fix branch Test unit tests with PTL bug fix https://github.com/Lightning-AI/pytorch-lightning/pull/19344/files Signed-off-by: Abhishree * Temporarily disable test_ema_saved_state in test_ema.py Signed-off-by: Abhishree * Skip test_beam_decoding_preserve_alignments in test_rnnt_decoding.py Signed-off-by: Abhishree * Use PTL with fs.lexists Signed-off-by: Abhishree * Comment _link_checkpoint related overrides In order to test with PTL without symbolic links Signed-off-by: Abhishree * Return only batch for dataloader_iter in DFT model Signed-off-by: Abhishree * Modify get_batch in GPTModel Signed-off-by: Abhishree * Add condition checks for batch extraction from dataloader_iter Signed-off-by: Abhishree * Add missing condition check for batch extraction in GPTModel Signed-off-by: Abhishree * Add condition check for dataloader_iter extraction in MegatronLMEncoderDecoder Signed-off-by: Abhishree * Comment test_invalid_checkpoints_removed_from_topk in test_exp_manager.py Signed-off-by: Abhishree * Fix test invalid ckpts in test_exp_manager.py Also uncomment some of the commented out tests in JenkinsFile and test_ema.py Signed-off-by: Abhishree * Fix bug in test_invalid_checkpoints_removed_from_topk Signed-off-by: Abhishree * Fix validation step of GPTModel for finetuning case with multi dataloaders Signed-off-by: Abhishree * Fix test_step_outputs for SFT in GPTMOdel Signed-off-by: Abhishree * Pass dataloader_idx for val_step of GPTModel and remove unwanted code 1) Pass dataloader_idx to val_step of GPTModel as its required for GPTSFTModel in case multi dataloaders to append the outputs correctly val/test_step_output 2) Remove val_iterator_done check from all megatron GPT models Signed-off-by: Abhishree * Add condition check for extraction of batch in T5SFTModel & LMEncoderDecoder Signed-off-by: Abhishree * Add condition check for extracting batch in MegatronNMTModel Also uncomment GPT PP=2 and NMT tests from JenkinsFIle Signed-off-by: Abhishree * Fix typo and uncomment multimodel tests Signed-off-by: Abhishree * Change to new dataloader_iter API for MultiModal Signed-off-by: Abhishree * Fix new dataloader_api for MegatronLatenDiffusion Model Signed-off-by: Abhishree * Store and restore precision value in MegatronGPTSFTModel Signed-off-by: Abhishree * Temporarily comment Multimodal Stable Diffusion Train Signed-off-by: Abhishree * Update JenkinsFile for multimodal with latest main Signed-off-by: Abhishree * Upgrade PTL to version 2.2 in reqs Signed-off-by: Abhishree * Install PTL 2.2 from fork Signed-off-by: Abhishree * Add strict arg to load_model_state_dict func in NLPDDPStrategy Signed-off-by: Abhishree * Delete 
megatron_t5_adapter_tuning.py, megatron_t5_ia3_tuning.py These files were added in the branch by mistake Signed-off-by: Abhishree * Delete megatron_t5_prompt_learning.py that got added by mistake Signed-off-by: Abhishree * Add appropriate comments, code clean up Signed-off-by: Abhishree * Remove PTL installation from JenkinsFile Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update PTL version to be >= 2.2.1 Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --------- Signed-off-by: Abhishree Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- Jenkinsfile | 35 ++++++----- .../megatron_bart_pretraining.py | 3 + .../megatron_change_num_partitions.py | 3 + .../megatron_retro_fine_tune.py | 3 + .../megatron_retro_mutransfer_pretrain.py | 3 + .../megatron_retro_pretraining.py | 3 + .../megatron_t5_lm_adaptation_finetune.py | 3 + .../megatron_t5_seq2seq_eval.py | 3 + .../megatron_t5_seq2seq_finetune.py | 3 + .../tuning/megatron_gpt_finetuning.py | 5 ++ .../tuning/megatron_gpt_sft.py | 3 + .../megatron_nmt_training.py | 3 + .../models/multimodal_llm/neva/neva_model.py | 18 +++--- .../text_to_image/controlnet/controlnet.py | 8 +-- .../text_to_image/dreambooth/dreambooth.py | 12 ++-- .../models/text_to_image/imagen/imagen.py | 12 ++-- .../stable_diffusion/ldm/ddpm.py | 12 ++-- .../clip/megatron_clip_models.py | 14 ++--- .../language_modeling/megatron_bert_model.py | 21 ++++--- .../language_modeling/megatron_gpt_model.py | 49 ++++++++++----- .../megatron_gpt_prompt_learning_model.py | 22 +++---- .../megatron_gpt_sft_model.py | 40 ++++++------- .../megatron_lm_encoder_decoder_model.py | 60 +++++++++---------- .../megatron_retrieval_model.py | 2 +- .../megatron_t5_adapter_model.py | 8 +-- .../megatron_t5_sft_model.py | 30 +++++----- .../machine_translation/megatron_nmt_model.py | 25 ++++---- .../nlp/parts/megatron_trainer_builder.py | 1 + nemo/collections/nlp/parts/nlp_overrides.py | 10 ++-- .../megatron_vit_classification_models.py | 14 ++--- nemo/utils/exp_manager.py | 4 +- requirements/requirements_lightning.txt | 2 +- .../convert_hf_llama_to_nemo.py | 3 +- tests/core/test_exp_manager.py | 4 +- 34 files changed, 243 insertions(+), 198 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ecd78365c787..cfd5853a6882 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -126,17 +126,17 @@ pipeline { } } - stage('L0: Unit Tests CPU') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - steps { - sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' - } - } + stage('L0: Unit Tests CPU') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + steps { + sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' + } + } stage('L2: Multimodal Imagen Train') { when { @@ -4082,7 +4082,6 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } } - // @athitten Remove /home/TestData/nlp/megatron_sft/trec.jsonl for validation and test file until we have support for multiple dataloaders in lightning 2.0 stage('L2: Megatron GPT Finetuning PP=2') { when { anyOf { @@ -4114,13 +4113,13 @@ 
assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.data.train_ds.num_workers=0 \ model.data.test_ds.micro_batch_size=1 \ model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ model.data.test_ds.names=[quarel] \ model.data.validation_ds.micro_batch_size=1 \ model.data.validation_ds.global_batch_size=1 \ model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.validation_ds.names=[quarel,trec]" sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ @@ -4143,13 +4142,13 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.data.train_ds.num_workers=0 \ model.data.test_ds.micro_batch_size=1 \ model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ model.data.test_ds.names=[quarel] \ model.data.validation_ds.micro_batch_size=1 \ model.data.validation_ds.global_batch_size=1 \ model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.validation_ds.names=[quarel,trec]" sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" } } diff --git a/examples/nlp/language_modeling/megatron_bart_pretraining.py b/examples/nlp/language_modeling/megatron_bart_pretraining.py index c2ba020a4a21..447c34426602 100644 --- a/examples/nlp/language_modeling/megatron_bart_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bart_pretraining.py @@ -60,6 +60,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index e135835292a3..436661e01b5d 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -935,6 +935,9 @@ def main(): plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + precision = None trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), 
accelerator="cpu", precision=precision) if tp_size < 0 or pp_size < 0: diff --git a/examples/nlp/language_modeling/megatron_retro_fine_tune.py b/examples/nlp/language_modeling/megatron_retro_fine_tune.py index aa7de6fda582..1577faa69a2b 100644 --- a/examples/nlp/language_modeling/megatron_retro_fine_tune.py +++ b/examples/nlp/language_modeling/megatron_retro_fine_tune.py @@ -99,6 +99,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py index 81a71650dc42..abe7006448e2 100644 --- a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py +++ b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py @@ -63,6 +63,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining.py b/examples/nlp/language_modeling/megatron_retro_pretraining.py index c1393863da57..909260856eef 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining.py @@ -62,6 +62,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(MixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py index e2af0b89ac48..0777d1f40819 100644 --- a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py @@ -61,6 +61,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index 4c11e10d99c5..ba8ea6492da3 100644 --- 
a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -94,6 +94,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 3204ba2f6d76..13be61f5b1c5 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -174,6 +174,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py index aaa087a46623..1e6f680fad7e 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py @@ -56,7 +56,12 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + # cfg.trainer.precision becomes None in TrainerBuilder if precision_plugins exist since both precision plugins and precision + # can't exist in PTL >= 2.1, hence storing precision value from cfg.trainer.precision as its used for future steps like in merge_cfg_with func. + precision = cfg.trainer.precision trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + # Restore the precision value after Trainer is built. 
+ cfg.trainer.precision = precision exp_manager(trainer, cfg.exp_manager) model_cfg = MegatronGPTSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py index 44d0737ad44e..fbaacbb7bff4 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py @@ -199,6 +199,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/machine_translation/megatron_nmt_training.py b/examples/nlp/machine_translation/megatron_nmt_training.py index 38b993479b3c..7946500f92e9 100644 --- a/examples/nlp/machine_translation/megatron_nmt_training.py +++ b/examples/nlp/machine_translation/megatron_nmt_training.py @@ -66,6 +66,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index f0137fd28722..44ab4785e8de 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -613,16 +613,16 @@ def forward(self, tokens, text_position_ids, attention_mask, labels, media=None) output_tensor = self.model(**forward_args) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None): - return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step) + def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): + return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step) - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ We pass the dataloader iterator function to the micro-batch scheduler. The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. 
""" - return MegatronGPTModel.training_step(self, dataloader_iter, batch_idx) + return MegatronGPTModel.training_step(self, dataloader_iter) def get_forward_output_and_loss_func(self, validation_step=False, tuning=False): def loss_func(output_tensor, loss_mask): @@ -634,7 +634,7 @@ def loss_func(output_tensor, loss_mask): return loss_for_ub, dict(avg=reduced_loss[0].unsqueeze(0)) def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) if parallel_state.get_pipeline_model_parallel_world_size() == 1: for k in batch.keys(): if self.get_attention_mask_from_fusion: @@ -690,7 +690,7 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) extra_arg = {} ( tokens, @@ -744,8 +744,8 @@ def id_func(output_tensor): return fwd_output_only_func - def validation_step(self, dataloader_iter, batch_idx): - return MegatronGPTModel.validation_step(self, dataloader_iter, batch_idx) + def validation_step(self, dataloader_iter): + return MegatronGPTModel.validation_step(self, dataloader_iter) def on_validation_epoch_end(self): if not self.validation_step_outputs: @@ -775,7 +775,7 @@ def on_validation_epoch_start(self): pass def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + return self.validation_step(batch) def test_epoch_end(self, outputs): averaged_loss = average_losses_across_data_parallel_group(outputs) diff --git a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py index 36329c3b7d0f..3f59eb66c81a 100644 --- a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py +++ b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py @@ -678,7 +678,7 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx=0): batch[self.cfg.first_stage_key] = batch[self.cfg.first_stage_key].cuda(non_blocking=True) self.model.on_train_batch_start(batch, batch_idx) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): tensor_shape = None # Placeholder # handle asynchronous grad reduction @@ -726,7 +726,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean, loss_dict - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -738,7 +738,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the apex fwd/bwd functions self._optimizer.zero_grad() - loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, False) if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): self.allreduce_sequence_parallel_gradients() @@ -827,7 +827,7 @@ def process_batch(batch): return [x, *c_list] def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] if 
len(self.conditioning_keys) == 0: diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py index 704f8b39371a..317cdf5d6364 100644 --- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py +++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py @@ -241,7 +241,7 @@ def forward(self, batch): output_tensor = self.model(batch) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): tensor_shape = None # Placeholder # handle asynchronous grad reduction @@ -290,7 +290,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean, loss_dict - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -303,7 +303,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the apex fwd/bwd functions self._optimizer.zero_grad() - loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, False) torch.distributed.broadcast(loss_mean, get_last_rank()) @@ -344,8 +344,8 @@ def training_step(self, dataloader_iter, batch_idx): ) return loss_mean - def validation_step(self, dataloader_iter, batch_idx): - loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + def validation_step(self, dataloader_iter): + loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, True) self.log_dict(val_loss_dict, prog_bar=False, logger=True, on_step=False, on_epoch=True, batch_size=1) @@ -394,7 +394,7 @@ def process_batch(batch): return images, cond def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] loss = model(batch) diff --git a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py index 90487eac61dc..4fa6cd230e03 100644 --- a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py +++ b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py @@ -248,7 +248,7 @@ def process_batch(batch): return [x_start, text_embed, text_mask, x_lowres] def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] loss, loss_dict = model(*batch) @@ -326,7 +326,7 @@ def setup_test_data(self, cfg): self._test_ds, batch_size=self._micro_batch_size, num_workers=cfg.num_workers, pin_memory=True, ) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): tensor_shape = None # handle asynchronous grad reduction @@ -377,7 +377,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean, loss_dict - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -390,7 +390,7 @@ def 
training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, False) torch.distributed.broadcast(loss_mean, get_last_rank()) @@ -458,14 +458,14 @@ def _append_sequence_parallel_module_grads(self, module, grads): grad = param.grad grads.append(grad.data) - def validation_step(self, dataloader_iter, batch_idx): + def validation_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size from the dataloader to produce a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ - loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, True) self.log_dict(val_loss_dict, prog_bar=False, logger=True, on_step=False, on_epoch=True, batch_size=1) return loss diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 36dfb74fbfaf..61bb664e43ed 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -1716,7 +1716,7 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx=0): batch[self.cfg.first_stage_key] = batch[self.cfg.first_stage_key].cuda(non_blocking=True) self.model.on_train_batch_start(batch, batch_idx) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): tensor_shape = None # Placeholder # handle asynchronous grad reduction @@ -1780,7 +1780,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean, loss_dict - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -1793,7 +1793,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, False) # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): @@ -1902,7 +1902,7 @@ def process_batch(batch): return [x, *c_list] def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] if len(self.conditioning_keys) == 0: @@ -1928,8 +1928,8 @@ def fwd_output_only_func(batch, model): return fwd_output_only_func - def validation_step(self, dataloader_iter, batch_idx): - loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + def validation_step(self, dataloader_iter): + loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, True) self.log_dict(val_loss_dict, prog_bar=False, logger=True, 
on_step=False, on_epoch=True, batch_size=1) diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index ea325a4a2839..fe35ae148026 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -452,7 +452,7 @@ def forward(self, image, text): output_tensor = self.model(image, text) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): # handle asynchronous grad reduction no_sync_func = None @@ -523,7 +523,7 @@ def initialize_ub_func(self): ) self.initialize_ub = False - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -557,7 +557,7 @@ def training_step(self, dataloader_iter, batch_idx): for param in module.embedding.parameters(): param.data_ptr() - loss_mean = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean = self.fwd_bwd_step(dataloader_iter, False) # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): @@ -649,7 +649,7 @@ def get_forward_output_and_loss_func(self): loss_func = ClipLoss(local_loss=self.cfg.local_loss, gather_with_grad=self.cfg.gather_with_grad,) def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) if parallel_state.get_pipeline_model_parallel_world_size() == 1: images = batch["images"].cuda(non_blocking=True) captions = batch["captions"].cuda(non_blocking=True) @@ -739,7 +739,7 @@ def accuracy(output, target, topk=(1,)): top5 = top5 / n return top1, top5 - def validation_step(self, dataloader_iter, batch_idx): + def validation_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -749,7 +749,7 @@ def validation_step(self, dataloader_iter, batch_idx): if self.initialize_ub: self.initialize_ub_func() - loss = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + loss = self.fwd_bwd_step(dataloader_iter, True) self.validation_step_outputs.append(loss) return loss @@ -785,7 +785,7 @@ def on_validation_epoch_end(self): return averaged_loss def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + return self.validation_step(batch) def test_epoch_end(self, outputs): averaged_loss = average_losses_across_data_parallel_group(outputs) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 49b64268e6b9..29e1d2656cdf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -210,7 +210,7 @@ def _validate_trainer(self): def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): if parallel_state.get_pipeline_model_parallel_world_size() == 1: - batch = next(dataloader_iter) 
+ batch, batch_idx, dataloader_idx = next(dataloader_iter) tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = ( batch['text'].cuda(non_blocking=True), batch['types'].cuda(non_blocking=True), @@ -220,7 +220,7 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ batch['padding_mask'].cuda(non_blocking=True), ) else: - batch = next(dataloader_iter) + batch, batch_idx, dataloader_idx = next(dataloader_iter) if parallel_state.is_pipeline_first_stage(): tokens = batch['text'].cuda(non_blocking=True) types = batch['types'].cuda(non_blocking=True) @@ -238,6 +238,9 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ sentence_order = batch['is_random'].cuda(non_blocking=True) tokens, types, loss_mask, lm_labels = None, None, None, None + dataloader_iter._dataloader_idx = dataloader_idx + dataloader_iter._batch_idx = batch_idx + if not self.cfg.bert_binary_head: types = None @@ -309,7 +312,7 @@ def forward( return output_tensor - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): self._optimizer.zero_grad() @@ -391,7 +394,7 @@ def training_step(self, dataloader_iter, batch_idx): if loss_scale is not None: self.log('loss_scale', loss_scale, batch_size=1) - if (batch_idx + 1) % self.trainer.accumulate_grad_batches == 0: + if (dataloader_iter._batch_idx + 1) % self.trainer.accumulate_grad_batches == 0: # Reduced loss for logging. self.log('reduced_train_loss', loss_mean[0], prog_bar=True, batch_size=1) if len(loss_mean) > 2: @@ -497,11 +500,7 @@ def allreduce_first_last_embeddings(self): grad = word_embeddings_weight.grad torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) - def validation_step(self, dataloader_iter, batch_idx): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return + def validation_step(self, dataloader_iter): prefix = "test" if self.trainer.testing else "val" if self.cfg.data.dataloader_type == "LDDL": seq_length = dataloader_iter.iterator.get_seqlen() @@ -542,8 +541,8 @@ def on_validation_epoch_end(self): self.log('val_loss', averaged_loss, prog_bar=True, batch_size=1) self.validation_step_outputs.clear() # free memory - def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + def test_step(self, dataloader_iter): + return self.validation_step(dataloader_iter) def on_test_epoch_end(self): averaged_loss = average_losses_across_data_parallel_group(self.test_step_outputs) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a63dfc7c5ce4..ac35af38de64 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -27,6 +27,7 @@ from omegaconf.dictconfig import DictConfig from pkg_resources import packaging from pytorch_lightning.accelerators import CPUAccelerator +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.parts.utils import extend_instance @@ -550,7 +551,7 @@ def forward(self, tokens, text_position_ids, attention_mask, labels): output_tensor = self.model(tokens, text_position_ids, attention_mask, labels=labels) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None): + def 
fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): # handle asynchronous grad reduction no_sync_func = None @@ -634,7 +635,7 @@ def initialize_ub_func(self): ) self.initialize_ub = False - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ We pass the dataloader iterator function to the micro-batch scheduler. The input batch to each micro-batch is fetched using the dataloader function @@ -673,7 +674,7 @@ def training_step(self, dataloader_iter, batch_idx): for param in module.embedding.parameters(): param.data_ptr() - loss_mean = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean = self.fwd_bwd_step(dataloader_iter, False) if self.cfg.get('fp8', False): self.prev_step_training = self.training @@ -925,7 +926,13 @@ def get_batch(self, data_iterator, tuning): # Broadcast data. if data_iterator is not None: - data = next(data_iterator) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class validation steps. In that case extract only the batch + # from the data_iterator + if isinstance(data_iterator, _DataFetcherWrapper): + data, _, _ = next(data_iterator) + else: + data = next(data_iterator) else: data = None @@ -1077,7 +1084,13 @@ def loss_func(output_tensor): def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): - batch = next(dataloader_iter) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class validation steps. In that case extract only the batch + # from the data_iterator + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) extra_arg = {} if len(batch) == 3: batch = [x.cuda() for x in batch] @@ -1127,17 +1140,13 @@ def id_func(output_tensor): return fwd_output_only_func - def validation_step(self, dataloader_iter, batch_idx): + def validation_step(self, dataloader_iter, dataloader_idx=0): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size from the dataloader to produce a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return mode = 'test' if self.trainer.testing else 'val' # Initialize userbuffer communicators.
if self.initialize_ub: @@ -1153,12 +1162,24 @@ def validation_step(self, dataloader_iter, batch_idx): else: first_val_step = None - loss = self.fwd_bwd_step(dataloader_iter, batch_idx, True, first_val_step) + loss = self.fwd_bwd_step(dataloader_iter, True, first_val_step) if isinstance(self.model, list): for model_module in self.model: model_module.train() - self.validation_step_outputs.append(loss) if mode == 'val' else self.test_step_outputs.append(loss) + + if mode == 'val': + # Append with the correct dataloader_idx in case of multiple dataloaders + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + self.validation_step_outputs[dataloader_idx].append(loss) + else: + self.validation_step_outputs.append(loss) + else: + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx].append(loss) + else: + self.test_step_outputs.append(loss) + return loss def on_validation_epoch_end(self): @@ -1194,8 +1215,8 @@ def on_validation_epoch_end(self): return averaged_loss - def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + def test_step(self, dataloader_iter): + return self.validation_step(dataloader_iter) def on_test_epoch_end(self): averaged_loss = average_losses_across_data_parallel_group(self.test_step_outputs) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index daa0c6dd02fa..617a585ef3a9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -309,7 +309,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. 
""" # Get seq length of batch - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) _, seq_length = batch[0].shape data_iter = get_iterator_k_split(batch, get_num_microbatches()) @@ -337,10 +337,10 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - batch = next(dataloader_iter) + batch, batch_idx, _ = next(dataloader_iter) loss_mean = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, forward_only=False) self.allreduce_gradients() @@ -373,13 +373,9 @@ def optimizer_zero_grad(self, *args, **kwargs): """ return - def validation_step(self, dataloader_iter, batch_idx): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return + def validation_step(self, dataloader_iter): mode = 'test' if self.trainer.testing else 'val' - batch = next(dataloader_iter) + batch, batch_idx, _ = next(dataloader_iter) gbs = self.cfg.get('validation_global_batch_size', self.cfg.global_batch_size) self._reconfigure_and_process_inference_batch(batch[0].size(0), gbs) loss_mean = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, forward_only=True) @@ -503,8 +499,8 @@ def on_validation_epoch_end(self): self._reconfigure_batch_sizes(gbs, mbs) self.validation_step_outputs.clear() # free memory - def test_step(self, dataloader_iter, batch_idx): - return self.validation_step(dataloader_iter, batch_idx) + def test_step(self, dataloader_iter): + return self.validation_step(dataloader_iter) def on_test_epoch_end(self): averaged_loss = average_losses_across_data_parallel_group(self.test_step_outputs) @@ -661,7 +657,7 @@ def set_input_tensor(self, input_tensor): def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = [x.cuda(non_blocking=True) for x in batch] input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids = batch output_tensor = model(input_ids, position_ids, attention_mask, taskname_ids, labels, inference=False) @@ -684,7 +680,7 @@ def get_forward_output_only_func(self): """ def fwd_output_only_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) extra_arg = {} ( tokens, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 3777047780f2..331f977a3265 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -18,6 +18,7 @@ import torch from omegaconf import DictConfig, ListConfig +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.metrics import MetricStringToTorchMetric @@ -184,13 +185,6 @@ def setup(self, stage=None): if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) - # Raise error if using multiple dataloaders - if type(self._validation_dl) == list and len(self._validation_dl) > 1: - raise NotImplementedError('Lightning 2.0 does not support multiple dataloaders 
with dataloader_iter') - - if type(self._test_dl) == list and len(self._test_dl) > 1: - raise NotImplementedError('Lightning 2.0 does not support multiple dataloaders with dataloader_iter') - # when using pipeline model parallel the final stage need to initialize word embeddings self.initialize_last_rank_embeddings() @@ -327,8 +321,13 @@ def _determine_log_key(self, data_config, dataloader_idx, metric_name, mode): else: return base_key + f"dataloader{dataloader_idx}" - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None): - batch = next(dataloader_iter) + def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): + # Return only batch if batch, batch_idx, dataloader_idx are extracted as a tuple in the previous func + # call like validation_step otherwise return tuple (in which case dataloader_iter is still a PTL _DataFetcherWrapper object) + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) log_token_counts = self.cfg.get('log_token_counts', False) if log_token_counts: @@ -399,24 +398,21 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step= return loss_mean - def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - return self.inference_step(dataloader_iter, batch_idx, 'validation', dataloader_idx) + def validation_step(self, dataloader_iter): + return self.inference_step(dataloader_iter, 'validation') - def test_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - # Add try except since dataloader_iter in PTL 2.0 doesnt catch the end of iterables - return self.inference_step(dataloader_iter, batch_idx, 'test', dataloader_idx) + def test_step(self, dataloader_iter): + return self.inference_step(dataloader_iter, 'test') - def inference_step(self, dataloader_iter, batch_idx, mode, dataloader_idx=0): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return - batch = next(dataloader_iter) + def inference_step(self, dataloader_iter, mode): + batch, batch_idx, dataloader_idx = next(dataloader_iter) data_cfg = self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds self._reconfigure_and_process_inference_batch(batch, data_cfg) # Meta data from dataset metadata = batch.get('metadata', [{}] * len(batch['tokens'])) - loss = super().validation_step(itertools.chain([batch]), batch_idx) + # Pass dataloader_idx, as it's needed in val_step of GPTModel to append the loss correctly to self.val/test_step_outputs + # in case of multi dataloaders + loss = super().validation_step(itertools.chain([batch]), dataloader_idx) if data_cfg.get("write_predictions_to_file", False) or data_cfg.metric.name != 'loss': # We need _inference_config to get generation params @@ -460,7 +456,7 @@ def inference_step(self, dataloader_iter, batch_idx, mode, dataloader_idx=0): def inference_epoch_end(self, outputs, mode, data_cfg): # Parent class will handle logging of the loss.
- if not outputs: + if not outputs or not outputs[0]: return if isinstance(outputs[0], dict): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 384acd599e40..38c887304f7a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -21,6 +21,7 @@ from omegaconf import OmegaConf, open_dict from omegaconf.dictconfig import DictConfig from pytorch_lightning.accelerators import CPUAccelerator +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( @@ -338,7 +339,7 @@ def _execute_fwd_bwd_function(self, data_iterator, forward_only, tensor_shape, d return mean_loss_dict - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): """ Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. @@ -353,7 +354,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): decoder_seq_length=self.max_decoder_seq_length, ) - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -365,7 +366,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the megatron fwd/bwd functions self._optimizer.zero_grad() - loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_dict = self.fwd_bwd_step(dataloader_iter, False) if self.with_distributed_adam: # synchronize asynchronous grad reductions @@ -566,7 +567,13 @@ def _process_batch(self, global_batch: Dict[str, torch.Tensor]) -> List[torch.Te def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class or previous functions. In that case extract only the batch + # from the data_iterator + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) # convert to list if not already converted. if isinstance(batch, dict): # convert to list if not already converted.
@@ -679,7 +686,11 @@ def _get_forward_output_only_func(self, arg_names, output_name, **kwargs): """ def fwd_output_only_func(dataloader_iter, model): - batch = next(dataloader_iter) + # Extract batch, batch_idx, dataloader_idx only if dataloader_iter is an object of PTL's _DataFetcherWrapper + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) batch = [x.cuda(non_blocking=True) if torch.is_tensor(x) else x for x in batch] # map batch and shared args into forward args @@ -699,48 +710,31 @@ def id_func(output_tensor): ########## - def _test_validation_step(self, step_outputs, dataloader_iter, batch_idx, dataloader_idx=0): + def _test_validation_step(self, dataloader_iter): """ Shared code for validation and test step """ - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return - loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, True) - step_outputs.append(loss_dict) + loss_dict = self.fwd_bwd_step(dataloader_iter, True) return loss_dict - def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): + def validation_step(self, dataloader_iter): """ return_values - if given, returns a dictionary with given keys and corresponding values """ + outputs = self._test_validation_step(dataloader_iter=dataloader_iter) if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - step_outputs = self.validation_step_outputs[dataloader_idx] + self.validation_step_outputs[dataloader_iter.dataloader_idx].append(outputs) else: - step_outputs = self.validation_step_outputs - - return self._test_validation_step( - step_outputs=step_outputs, - dataloader_iter=dataloader_iter, - batch_idx=batch_idx, - dataloader_idx=dataloader_idx, - ) + self.validation_step_outputs.append(outputs) - def test_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - step_outputs = self.test_step_outputs[dataloader_idx] + def test_step(self, dataloader_iter): + outputs = self._test_validation_step(dataloader_iter=dataloader_iter) + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_iter.dataloader_idx].append(outputs) else: - step_outputs = self.test_step_outputs - - return self._test_validation_step( - step_outputs=step_outputs, - dataloader_iter=dataloader_iter, - batch_idx=batch_idx, - dataloader_idx=dataloader_idx, - ) + self.test_step_outputs.append(outputs) def _test_validation_epoch_end(self, step_outputs, prefix): """ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index d10c9f27f6cb..ebe936a8178a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -325,7 +325,7 @@ def validation_step(self, batch, batch_idx): if prefix == 'val': self.validation_step_outputs.append(reduced_loss) else: - self.test_step_outputs.apped(reduced_loss) + self.test_step_outputs.append(reduced_loss) return reduced_loss def on_validation_epoch_end(self): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py index d1332831ef1d..31eb4519ded2 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py @@ -146,12 +146,8 @@ def compute_accuracy(self, enc_input, enc_mask, encoder_input, labels): 'enc_inputs': processed_inputs, } - def validation_step(self, dataloader_iter, batch_idx, inference=False): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return - batch = next(dataloader_iter) + def validation_step(self, dataloader_iter): + batch, batch_idx, _ = next(dataloader_iter) enc_input, dec_input, labels, loss_mask, enc_mask, dec_mask, position_ids, taskname_ids = batch mode = self.training diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py index 22483731a534..0b32530668be 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py @@ -17,6 +17,7 @@ import torch from omegaconf import DictConfig, ListConfig +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.data import ConcatMapDataset @@ -26,7 +27,6 @@ from nemo.collections.nlp.data.language_modeling.megatron.t5_sft_dataset import T5SFTDataset from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model, T5Sentinel from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split - from nemo.collections.nlp.parts.mixins.nlp_adapter_mixins import NLPAdapterModelMixin from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging @@ -288,12 +288,18 @@ def _reconfigure_and_process_inference_batch(self, batch, ds_config): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): """ Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ - batch = next(dataloader_iter) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class. In that case extract only the batch + # from the data_iterator + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) if isinstance(batch, dict): # convert to list if not already converted. batch = self._process_batch(batch) @@ -312,14 +318,10 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): decoder_seq_length=decoder_seq_length, ) - def inference_step(self, dataloader_iter, batch_idx: int, mode: str, dataloader_idx=0): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return + def inference_step(self, dataloader_iter, mode: str): # Regular finetuning datasets will return a list of dicts for each microbatch. # But T0 datasets will return a single dict for the global batch.
- batch = next(dataloader_iter) + batch, batch_idx, dataloader_idx = next(dataloader_iter) batch_has_lang_information = isinstance(batch, list) and len(batch[0]) == 7 data_cfg = self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds @@ -327,7 +329,7 @@ def inference_step(self, dataloader_iter, batch_idx: int, mode: str, dataloader_ # NOTE: There could be extra keys in the processed_batch dictionary such as "langs" for XNLI, # this will be ignored. - loss = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, forward_only=True) + loss = self.fwd_bwd_step(itertools.chain([batch]), forward_only=True) predicted_token_ids, _ = self.decode( tokens_enc=batch['text_enc'], @@ -589,16 +591,16 @@ def write_predictions_to_file(self, outputs, output_file_path_prefix): for i, p, l in zip(outputs['inputs'], outputs['preds'], outputs['labels']): f_json.write(json.dumps({'input': i, 'pred': p, 'label': l}) + '\n') - def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - return self.inference_step(dataloader_iter, batch_idx, 'validation', dataloader_idx) + def validation_step(self, dataloader_iter): + return self.inference_step(dataloader_iter, 'validation') def on_validation_epoch_end(self): _ = self.inference_epoch_end(self.validation_step_outputs, 'validation', self.cfg.data.validation_ds) # Commenting as on_validation_epoch_end was a no-op in PTL 1.9 # return super().on_validation_epoch_end() - def test_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - return self.inference_step(dataloader_iter, batch_idx, 'test', dataloader_idx) + def test_step(self, dataloader_iter): + return self.inference_step(dataloader_iter, 'test') def on_test_epoch_end(self): _ = self.inference_epoch_end(self.test_step_outputs, 'test', self.cfg.data.test_ds) diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 5deac0c43e67..952c76ce929e 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -21,6 +21,7 @@ import torch from omegaconf.dictconfig import DictConfig from omegaconf.listconfig import ListConfig +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from sacrebleu import corpus_bleu @@ -286,12 +287,18 @@ def _build_vocab(self): tensor_model_parallel_size=self._cfg.get('tensor_model_parallel_size', 1), ) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): """ Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ - batch = next(dataloader_iter) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class or previous functions. In that case extract only the batch + # from the data_iterator + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) if isinstance(batch, dict): # convert to list if not already converted.
batch = self._process_batch(batch) @@ -310,13 +317,9 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): decoder_seq_length=decoder_seq_length, ) - def eval_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return + def eval_step(self, dataloader_iter): # Need to squeze dim 0 for old NMT datasets since things are pre-batched and we ask the dataloader for batch size 1. - batch = next(dataloader_iter) + batch, _, dataloader_idx = next(dataloader_iter) batch = [x.squeeze(dim=0) if x.ndim == 3 else x for x in batch] batch = self.process_global_batch_for_text_translation_datasets(batch) @@ -330,7 +333,7 @@ def eval_step(self, dataloader_iter, batch_idx, dataloader_idx=0): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) # This returns the averaged loss across data-parallel groups. - reduced_loss = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, True) + reduced_loss = self.fwd_bwd_step(itertools.chain([batch]), True) tokens_enc, labels, enc_mask = batch['text_enc'], batch['labels'], batch['enc_mask'] @@ -400,12 +403,12 @@ def postprocess_outputs(self, outputs, tokenizer, processor): return results - def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): + def validation_step(self, dataloader_iter): """ Lightning calls this inside the validation loop with the data from the validation dataloader passed in as `batch`. """ - return self.eval_step(dataloader_iter, batch_idx, dataloader_idx) + return self.eval_step(dataloader_iter) def _setup_eval_dataloader_from_config(self, cfg: DictConfig, dataset): rank = parallel_state.get_data_parallel_rank() diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 0cd1563b2849..055671219fb8 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -114,6 +114,7 @@ def _plugins(self) -> list: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + self.cfg.trainer.precision = None if self.cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 48fe9034ad25..66fa99ffefd1 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -33,7 +33,6 @@ from pytorch_lightning.callbacks.progress.tqdm_progress import _update_n from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.loops.fetchers import _DataFetcher -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import MixedPrecisionPlugin @@ -219,7 +218,7 @@ def configure_ddp(self): hasattr(self.model, 'with_distributed_adam') and self.model.with_distributed_adam ): # do not use DDP if using megatron amp O2 or distributed optimizer - self._model = _LightningModuleWrapperBase(self.model) + self._model = self.model else: app_state = AppState() @@ -236,7 +235,7 @@ def configure_ddp(self): # self.pre_configure_ddp() # device_ids = 
self.determine_ddp_device_ids() self._model = DistributedDataParallel( - _LightningModuleWrapperBase(self.model), + self.model, process_group=parallel_state.get_data_parallel_group(with_context_parallel=True), **self._ddp_kwargs, ) @@ -360,7 +359,8 @@ def save_checkpoint( if self.is_global_zero or app_state.data_parallel_rank == 0: self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) - def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + # PTL 2.2 supports non strict loading of the ckpt with the strict arg (https://github.com/Lightning-AI/pytorch-lightning/pull/19404) + def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = True) -> None: # if using distributed checkpointing, the state dict logic is at the model level if ( hasattr(self.lightning_module, 'sharded_state_dict') @@ -390,7 +390,7 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: new_state_dict[new_key] = checkpoint['state_dict'][key] checkpoint['state_dict'] = new_state_dict - self.lightning_module.load_state_dict(checkpoint["state_dict"]) + self.lightning_module.load_state_dict(checkpoint["state_dict"], strict=strict) def _fix_tensors_device(self, ckpt: Dict) -> Dict: """ Ensure checkpoint tensors are on the correct device.""" diff --git a/nemo/collections/vision/models/megatron_vit_classification_models.py b/nemo/collections/vision/models/megatron_vit_classification_models.py index 2ced8c8ecc08..c27c37c2b917 100644 --- a/nemo/collections/vision/models/megatron_vit_classification_models.py +++ b/nemo/collections/vision/models/megatron_vit_classification_models.py @@ -275,7 +275,7 @@ def forward(self, tokens): output_tensor = self.model(tokens) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): # handle asynchronous grad reduction no_sync_func = None @@ -351,7 +351,7 @@ def initialize_ub_func(self): ) self.initialize_ub = False - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -367,7 +367,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - loss_mean, _ = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, _ = self.fwd_bwd_step(dataloader_iter, False) # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): @@ -477,7 +477,7 @@ def loss_func(labels, output_tensor): return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]} def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) if parallel_state.get_pipeline_model_parallel_world_size() == 1: batch = [x.cuda(non_blocking=True) for x in batch] tokens, labels = batch @@ -506,7 +506,7 @@ def fwd_output_only_func(batch, model): return fwd_output_only_func - def validation_step(self, dataloader_iter, batch_idx): + def validation_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -519,7 +519,7 @@ def validation_step(self, 
dataloader_iter, batch_idx): if self.initialize_ub: self.initialize_ub_func() - loss, accuracy = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + loss, accuracy = self.fwd_bwd_step(dataloader_iter, True) self.validation_step_outputs.append((loss, accuracy)) if mode == 'val' else self.test_step_outputs.append( (loss, accuracy) @@ -554,7 +554,7 @@ def on_validation_epoch_end(self): return averaged_loss def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + return self.validation_step(batch) def on_test_epoch_end(self): pass diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index db45701385e8..7f915b82c820 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -1034,10 +1034,10 @@ class SkipResumeTrainingValidationLoop(_TrainingEpochLoop): the training state before validation has run. """ - def _should_check_val_fx(self) -> bool: + def _should_check_val_fx(self, data_fetcher) -> bool: if self.restarting and self.global_step % self.trainer.val_check_batch == 0: return False - return super()._should_check_val_fx() + return super()._should_check_val_fx(data_fetcher) def clean_exp_ckpt(exp_log_dir: Union[str, Path], remove_ckpt: bool = True, remove_nemo: bool = False): diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 7adea60957fe..ee9423b9115c 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,6 +1,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 -pytorch-lightning>=2.0,<=2.0.7 +pytorch-lightning>=2.2.1 torchmetrics>=0.11.0 transformers>=4.36.0 wandb diff --git a/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py index 597d6f2ccc74..e50f7fa71f2d 100644 --- a/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py @@ -133,7 +133,8 @@ def convert(args): nemo_config.precision = precision print(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together. 
+ trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = hf_config["hidden_size"] head_num = hf_config["num_attention_heads"] diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 8073a75e14ca..8883d6514119 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -946,7 +946,9 @@ def test_invalid_checkpoints_removed_from_topk(self, tmp_path): test_trainer2.fit(model) ckpt_filenames = {f.name for f in checkpoints_dir.rglob("*.ckpt") if f.is_file()} - assert len(ckpt_filenames) == 4 # 3 top + 1 last + # 3 top + 1 last + 1 resume ckpt since PTL >= 2.1 ensures to never delete the resume ckpt + # (https://github.com/Lightning-AI/pytorch-lightning/pull/18750) + assert len(ckpt_filenames) == 5 assert 'epoch=9-last.ckpt' in ckpt_filenames assert 'epoch=8.ckpt' in ckpt_filenames assert 'epoch=7.ckpt' in ckpt_filenames From 6be016c450bd680dc32d2d5179aea2dc56471c47 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Fri, 8 Mar 2024 11:30:23 -0800 Subject: [PATCH 006/140] Update docs for NeMo Framework (#8596) * Update docs version Signed-off-by: smajumdar * Update docs for NeMo Framework Signed-off-by: smajumdar * Update docs for NeMo Framework Signed-off-by: smajumdar --------- Signed-off-by: smajumdar --- docs/source/_static/css/custom.css | 8 ++- docs/source/conf.py | 6 +- docs/source/index.rst | 73 +++++++++++++---------- docs/source/starthere/intro.rst | 8 --- docs/source/starthere/migration-guide.rst | 4 +- docs/source/vision/intro.rst | 4 +- requirements/requirements_docs.txt | 10 ++-- 7 files changed, 60 insertions(+), 53 deletions(-) diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css index cf0ad0ff2d7f..2dae2661b353 100644 --- a/docs/source/_static/css/custom.css +++ b/docs/source/_static/css/custom.css @@ -1,3 +1,5 @@ +@import url("theme.css"); + body { font-size: 100%; font-family: 'NVIDIA Sans', sans-serif; @@ -40,13 +42,17 @@ p { } /* Link Colors */ +/* a { - color: #76b900; + color: #76b900; } +/* +/* a:visited { color: #218219; } +*/ .container-xl { margin-right: unset; diff --git a/docs/source/conf.py b/docs/source/conf.py index 0596b15e3de5..6d086cb42e9f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -203,6 +203,10 @@ # html_logo = html_theme_options["logo_path"] +# html_sidebars = { +# "**": ["navbar-logo.html", "search-field.html", "sbt-sidebar-nav.html"] +# } + # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. @@ -222,7 +226,7 @@ html_title = 'NVIDIA NeMo' html_theme_options = { - 'logo_only': True, + 'logo_only': False, 'display_version': True, # 'prev_next_buttons_location': 'bottom', # 'style_external_links': False, diff --git a/docs/source/index.rst b/docs/source/index.rst index 9d66d693000e..822431a9108a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,6 +1,12 @@ NVIDIA NeMo Framework Developer Docs ==================================== +.. include:: starthere/intro.rst + + +Index of NeMo Framework Developer Docs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. toctree:: :maxdepth: 2 :caption: Getting Started @@ -9,18 +15,19 @@ NVIDIA NeMo Framework Developer Docs starthere/intro starthere/tutorials starthere/best-practices - starthere/migration-guide + .. 
toctree:: - :maxdepth: 3 - :caption: Multimodal (MM) - :name: Multimodal + :maxdepth: 2 + :caption: NeMo Core + :name: core - multimodal/mllm/intro - multimodal/vlm/intro - multimodal/text2img/intro - multimodal/nerf/intro - multimodal/api + core/core + core/exp_manager + core/neural_types + core/export + core/adapters/intro + core/api .. toctree:: @@ -34,10 +41,11 @@ NVIDIA NeMo Framework Developer Docs nlp/megatron_onnx_export nlp/api + .. toctree:: :maxdepth: 2 - :caption: Speech Processing - :name: Speech Processing + :caption: Speech AI + :name: Speech AI asr/intro asr/speech_classification/intro @@ -46,6 +54,19 @@ NVIDIA NeMo Framework Developer Docs asr/ssl/intro asr/speech_intent_slot/intro + +.. toctree:: + :maxdepth: 3 + :caption: Multimodal (MM) + :name: Multimodal + + multimodal/mllm/intro + multimodal/vlm/intro + multimodal/text2img/intro + multimodal/nerf/intro + multimodal/api + + .. toctree:: :maxdepth: 1 :caption: Text To Speech (TTS) @@ -55,37 +76,16 @@ NVIDIA NeMo Framework Developer Docs .. toctree:: :maxdepth: 2 - :caption: Vision + :caption: Vision (CV) :name: vision vision/intro - -.. toctree:: - :maxdepth: 2 - :caption: NeMo Core - :name: core - - core/core - core/exp_manager - core/neural_types - core/export - core/adapters/intro - core/api - .. toctree:: :maxdepth: 2 :caption: Common :name: Common - text_processing/intro - -.. toctree:: - :maxdepth: 2 - :caption: Text Processing - :name: Text Processing - - text_processing/g2p/g2p common/intro @@ -95,3 +95,10 @@ NVIDIA NeMo Framework Developer Docs :name: Speech Tools tools/intro + +.. toctree:: + :maxdepth: 2 + :caption: Upgrade Guide + :name: Upgrade Guide + + starthere/migration-guide \ No newline at end of file diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst index 185350bad3ab..77a1ca0255a1 100644 --- a/docs/source/starthere/intro.rst +++ b/docs/source/starthere/intro.rst @@ -98,14 +98,6 @@ See the two introductory videos below for a high level overview of NeMo. -**NVIDIA NeMo: Toolkit for Conversational AI at PyData Yerevan 2022** - -.. raw:: html - -
- -
- .. _installation: Installation diff --git a/docs/source/starthere/migration-guide.rst b/docs/source/starthere/migration-guide.rst index 15b4940172c3..1d9816493a5b 100644 --- a/docs/source/starthere/migration-guide.rst +++ b/docs/source/starthere/migration-guide.rst @@ -1,5 +1,5 @@ -Migration guide to use lightning 2.0 -===================================== +Upgrade guide to use lightning 2.0 +================================== .. # define a hard line break for html .. |br| raw:: html diff --git a/docs/source/vision/intro.rst b/docs/source/vision/intro.rst index 6df5881e1121..4f4462404b90 100644 --- a/docs/source/vision/intro.rst +++ b/docs/source/vision/intro.rst @@ -1,5 +1,5 @@ -Foundation Vision Models in NeMo -================================ +Vision Models +============= NeMo has implemented foundational vision models, establishing a solid base for further exploration into multimodal applications. These foundational vision models can be leveraged in a variety of multimodal applications including multimodal language models and text to image generation tasks, among others. These foundation models not only lay the functional groundwork but also play a crucial role in achieving state-of-the-art performance on NVIDIA GPUs through our custom optimizations. diff --git a/requirements/requirements_docs.txt b/requirements/requirements_docs.txt index 8412c67d4ab2..ff3ec5202b0e 100644 --- a/requirements/requirements_docs.txt +++ b/requirements/requirements_docs.txt @@ -1,14 +1,12 @@ boto3 -Jinja2<3.1 +Jinja2 latexcodec numpy -# sphinx-book-theme is incompatible with pydata-sphinx-theme>0.13.2 -# https://github.com/executablebooks/sphinx-book-theme/issues/711 -pydata-sphinx-theme==0.13.1 -Sphinx>=4.0,<6,!=5.0.0 +pydata-sphinx-theme +Sphinx sphinx-book-theme sphinx-copybutton sphinxcontrib-bibtex sphinxext-opengraph -urllib3<2.0.0 +urllib3 wrapt From 5a3450dbdf65654fee1310dbcc205347537b3143 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:28:50 -0500 Subject: [PATCH 007/140] Update results.rst for Canary Inference (#8562) * Update results.rst for Canary Inference Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update results.rst for Canary Inference Signed-off-by: Krishna Puvvada --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada --- docs/source/asr/results.rst | 51 +++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/docs/source/asr/results.rst b/docs/source/asr/results.rst index b38a661a0ea9..05f91dde88ae 100644 --- a/docs/source/asr/results.rst +++ b/docs/source/asr/results.rst @@ -133,10 +133,56 @@ Often times, we want to transcribe a large number of files at once (maybe from a # process a batch of 32 results (or less if last batch does not contain 32 elements) .... +For more information, see `nemo.collections.asr.modules <./api.html#modules>`__. For more information on the general ``Transcription API``, please take a look at :class:`~nemo.collections.asr.parts.mixins.transcription.TranscriptionMixin`. The audio files should be 16KHz mono-channel wav files. ----- -For more information, see `nemo.collections.asr.modules <./api.html#modules>`__. For more information on the general ``Transcription API``, please take a look at :class:`~nemo.collections.asr.parts.mixins.transcription.TranscriptionMixin`. 
The audio files should be 16KHz mono-channel wav files.
+Inference with Multi-task Models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Multi-task models that use structured prompts require additional task tokens as input, in which case it is recommended to use a manifest as input. Below is an example of using the `nvidia/canary-1b` model:
+
+.. code-block:: python
+    from nemo.collections.asr.models import EncDecMultiTaskModel
+
+    # load model
+    canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
+
+    # update decode params
+    decode_cfg = canary_model.cfg.decoding
+    decode_cfg.beam.beam_size = 1
+    canary_model.change_decoding_strategy(decode_cfg)
+
+    # run transcribe
+    predicted_text = canary_model.transcribe(
+        "",
+        batch_size=16, # batch size to run the inference with
+    )
+
+Here the manifest file should be a json file where each line has the following format:
+
+.. code-block:: bash
+    {
+        "audio_filepath": "/path/to/audio.wav", # path to the audio file
+        "duration": None, # duration of the audio in seconds, set to `None` to use full audio
+        "taskname": "asr", # use "ast" for speech-to-text translation
+        "source_lang": "en", # language of the audio input, set `source_lang`==`target_lang` for ASR
+        "target_lang": "en", # language of the text output
+        "pnc": "yes", # whether to have PnC output, choices=['yes', 'no']
+        "answer": "na", # set to non-dummy strings to calculate WER/BLEU scores
+    }
+
+Note that using a manifest allows specifying the task configuration for each audio file individually. If we want to use the same task configuration for all the audio files, it can be specified in the `transcribe` method directly.
+
+.. code-block:: python
+    canary_model.transcribe(
+        audio=[list of audio files],
+        batch_size=4, # batch size to run the inference with
+        task="asr", # use "ast" for speech-to-text translation
+        source_lang="en", # language of the audio input, set `source_lang`==`target_lang` for ASR
+        target_lang="en", # language of the text output
+        pnc=True, # whether to have PnC output, choices=[True, False]
+    )

 Inference on long audio
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -180,6 +226,7 @@ Sometimes, the downsampling module at the earliest stage of the model can take m

     # Speedup conv subsampling factor to speed up the subsampling module.
     asr_model.change_subsampling_conv_chunking_factor(1) # 1 = auto select
+
 .. note::
     Only certain models which use depthwise separable convolutions in the downsampling layer support this operation.
     Please try it out on your model and see if it is supported.
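As an illustration of the note above, a minimal end-to-end sketch (not part of this patch; the checkpoint name `stt_en_conformer_ctc_large` is only an assumed example of a model whose downsampling layer supports this operation):

```python
from nemo.collections.asr.models import ASRModel

# Load an ASR model; substitute the checkpoint you are actually evaluating.
asr_model = ASRModel.from_pretrained("stt_en_conformer_ctc_large")

# Let NeMo pick a chunking factor for the subsampling convolutions automatically,
# which lowers peak memory when transcribing very long recordings.
asr_model.change_subsampling_conv_chunking_factor(1)  # 1 = auto select

transcripts = asr_model.transcribe(["/path/to/long_audio.wav"], batch_size=1)
print(transcripts[0])
```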
@@ -388,4 +435,4 @@ Code-Switching :file: data/benchmark_code_switching.csv :align: left :widths: 40, 10, 50 - :header-rows: 1 \ No newline at end of file + :header-rows: 1 From eeb0dd7ae1b3edb87770fcd8b984e6202ba1b2e5 Mon Sep 17 00:00:00 2001 From: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:15:43 -0800 Subject: [PATCH 008/140] bug fix in long-form transcription for canary (#8614) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada --- nemo/collections/asr/parts/utils/streaming_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index d90fe2be981e..71c945b66255 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -1590,8 +1590,8 @@ def get_input_tokens(self, sample: dict): ) tokens = canary_prompt( tokenizer=self.asr_model.tokenizer, - text="none", - language=sample['target_lang'], + text=None, + language=None, source_language=sample['source_lang'], target_language=sample['target_lang'], taskname=sample['taskname'], @@ -1619,7 +1619,7 @@ def _get_batch_preds(self, keep_logits=False): tokens = self.input_tokens.to(device).repeat(feat_signal.size(0), 1) tokens_len = torch.tensor([tokens.size(1)] * tokens.size(0), device=device).long() - batch_input = (feat_signal, feat_signal_len, tokens, tokens_len) + batch_input = (feat_signal, feat_signal_len, None, None, tokens, tokens_len) predictions = self.asr_model.predict_step(batch_input, has_processed_signal=True) self.all_preds.extend(predictions) del predictions From 438db620bffdf4e2d4cef6368d0e86be2a02b7c3 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 8 Mar 2024 16:19:15 -0800 Subject: [PATCH 009/140] Fixes gpt mcore conversion to account for _extra_state that may be present (#8618) --- scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py b/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py index e152736734f6..8e2c2d350855 100644 --- a/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py +++ b/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py @@ -292,7 +292,9 @@ def run_sanity_checks(nemo_file, mcore_file, cpu_only=False, ignore_if_missing=t logging.info("✅ Weights match") # check for unexpected weights in state dict - assert len(nemo_state_dict) == 0, f"❌ unexpected items in nemo_state_dict: {nemo_state_dict}" + assert ( + len([k for k in nemo_state_dict if not k.endswith('_extra_state')]) == 0 + ), f"❌ unexpected items in nemo_state_dict: {nemo_state_dict}" assert ( len([k for k in mcore_state_dict if not k.endswith('_extra_state')]) == 0 ), f"❌ unexpected items in mcore_state_dict: {mcore_state_dict}" From 49c10c881c835725ae24ed115b6aefd2cb595e8e Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 11 Mar 2024 11:50:44 -0400 Subject: [PATCH 010/140] Fix LoRA SP no redundant gather + linear_fc1 lora logic (#8621) * remove LoRA SP no redundant comm for all linear layers Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert to scatter in adapter module instead of scatter after add Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Chen Cui 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../modules/common/megatron/adapters/mcore_mixins.py | 12 ++++++++++-- .../common/megatron/adapters/parallel_adapters.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 368d2cc52ae0..3eb63e96c3a3 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -248,16 +248,23 @@ def mcore_register_adapters(self): self.set_accepted_adapter_types( [LoraHto4HAdapterConfig._target_, Lora4HtoHAdapterConfig._target_, MLPInfusedAdapterConfig._target_] ) # only self attn (packed qkv) for now + self.linear_fc1.return_layernorm_output = True # need layernorm output for lora mlp + if self.config.sequence_parallel and hasattr(self.linear_fc1, "return_layernorm_output_gathered"): + # for LoRA SP, TE v1.5 can return layernorm output gathered so there is no need + # to perform the redundant gather in the adapter module. + self.linear_fc1.return_layernorm_output_gathered = True def forward(self, hidden_states): # [s, b, 4 * h/p] - intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + linear_fc1_output, bias_parallel = self.linear_fc1(hidden_states) + + intermediate_parallel, layernorm_output = linear_fc1_output # LoRA logic if self.is_adapter_available(): lora_linear_fc1_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) if lora_linear_fc1_adapter and self.adapter_cfg[AdapterName.LORA_Hto4H_ADAPTER]['enabled']: - lora_output = lora_linear_fc1_adapter(hidden_states) + lora_output = lora_linear_fc1_adapter(layernorm_output) intermediate_parallel = intermediate_parallel + lora_output if self.config.bias_activation_fusion: @@ -294,6 +301,7 @@ def glu(x): if lora_linear_fc2_adapter and self.adapter_cfg[AdapterName.LORA_4HtoH_ADAPTER]['enabled']: lora_output = lora_linear_fc2_adapter(intermediate_parallel) output = output + lora_output + return output, output_bias diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index d31125945f73..8c34f528f2d9 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -237,7 +237,7 @@ def __init__( # revert config change in case it is read elsewhere model_parallel_config.sequence_parallel = self._sequence_parallel - if self._sequence_parallel: + if self._sequence_parallel and not input_is_parallel: from importlib.metadata import version from pkg_resources import packaging From 1f6191ef0cccc1bf7e256448b85d59a9c43fb553 Mon Sep 17 00:00:00 2001 From: Zeeshan Patel Date: Mon, 11 Mar 2024 11:12:42 -0700 Subject: [PATCH 011/140] fixed pp eval for sft/lora (#8616) Co-authored-by: Chen Cui --- .../language_modeling/megatron_gpt_sft_model.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 331f977a3265..325f039d461b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -423,12 
+423,15 @@ def inference_step(self, dataloader_iter, mode): self._inference_config['tokens_to_generate'] = data_cfg.get('tokens_to_generate') output = self.predict_step(batch, batch_idx, dataloader_idx) - inputs_text = [self.tokenizer.ids_to_text(c.tolist()) for c in batch['contexts']] - labels_text = [self.tokenizer.ids_to_text(a.tolist()) for a in batch['answers']] - preds_text = [ - self.tokenizer.ids_to_text(t[l.item() :][: data_cfg.get('tokens_to_generate')]) - for t, l in zip(output['token_ids'], batch['context_lengths']) - ] + if output: + inputs_text = [self.tokenizer.ids_to_text(c.tolist()) for c in batch['contexts']] + labels_text = [self.tokenizer.ids_to_text(a.tolist()) for a in batch['answers']] + preds_text = [ + self.tokenizer.ids_to_text(t[l.item() :][: data_cfg.get('tokens_to_generate')]) + for t, l in zip(output['token_ids'], batch['context_lengths']) + ] + else: + inputs_text, labels_text, preds_text = [], [], [] else: inputs_text, labels_text, preds_text = [], [], [] From f005f1323eaf9a23ad6dc4bc326dc95bf0002e8d Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Mon, 11 Mar 2024 20:40:36 -0700 Subject: [PATCH 012/140] Set precision None in megatron_ckpt_to_nemo.py (#8630) Signed-off-by: Abhishree --- examples/nlp/language_modeling/megatron_ckpt_to_nemo.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index c4c0394e3892..c58ae7f156eb 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -142,6 +142,9 @@ def convert(local_rank, rank, world_size, args): hysteresis=cfg.model.get('hysteresis', 2), ) plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size From ac0396f37350bf98f2887c224428e387d82dc155 Mon Sep 17 00:00:00 2001 From: Chris Alexiuk <161380339+chrisalexiuk-nvidia@users.noreply.github.com> Date: Tue, 12 Mar 2024 01:36:25 -0400 Subject: [PATCH 013/140] Minor Updates to GPT Training Documentation Example (#8629) Minor copy and instruction changes to improve tutorial viability. Signed-off-by: Chris Alexiuk <161380339+chrisalexiuk-nvidia@users.noreply.github.com> Co-authored-by: Eric Harper --- .../nlp/nemo_megatron/gpt/gpt_training.rst | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst index 4c0a09b7f6ea..986e7be30a00 100644 --- a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst +++ b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst @@ -6,7 +6,10 @@ GPT is a decoder-only Transformer model. Quick start ^^^^^^^^^^^ -Steps below demonstrate training of a GPT style model with NeMo +The steps below demonstrate training of a GPT-style model with NeMo + +.. 
note::
+    This example is best completed using the latest NeMo Framework NGC Container

 Data download & pre-processing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -16,7 +19,7 @@ Data download & pre-processing

 **Step 1: Download data**

-The step below will download Wikipedia data (around 20GB) and can take some several hours.
+The step below will download Wikipedia data (around 20GB) and can take several hours.

 .. code-block:: bash

@@ -35,12 +38,13 @@ Now, ``train_data.jsonl`` will contain our training data in the json line format

 **Step 3: Train tokenizer**

-Below we will condider 2 options for training data tokenizers: Using pre-built HuggingFace BPE and training and using your own Google Sentencepiece tokenizer.
-Note that only second option allows you to experiment with vocabulary size.
+Below we will consider 2 options for training data tokenizers: Using pre-built HuggingFace BPE and training and using your own Google Sentencepiece tokenizer.
+
+Note that only the second option allows you to experiment with vocabulary size.

 *Option 1:* Using HuggingFace GPT2 tokenizer files.

-With this option we will just download pre-built vocabulary and merge files for BPE tokenizer.
+With this option, we will download a pre-built vocabulary and merge the files for the BPE tokenizer.

 .. code-block:: bash

@@ -50,7 +54,7 @@ With this option we will just download pre-built vocabulary and merge files for

 *Option 2:* Using `Google Sentencepiece `_ tokenizer library.

-It comes as dependency with NeMo, so if you have installed NeMo it should already be installed.
+It comes as a dependency with NeMo, so if you have installed NeMo it should already be installed.
 Note that training the tokenizer model will also take some time.

 .. code-block:: bash

@@ -66,11 +70,11 @@ Note that training tokenizer model will also take some time.
     --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 \
     --split_digits true

-After this is done (will take a while), you'll have two files: ```spm_32k_wiki.model and spm_32k_wiki.vocab`` which correspond to model and vocabulary.
+After this is done (will take a while), you'll have two files: ``spm_32k_wiki.model`` and ``spm_32k_wiki.vocab``, corresponding to the model and vocabulary.

 **Step 4: Convert training data into memory map format**

-This format makes training more efficient, especially with many nodes and GPUs. This step will also tokenize data using tokenizer model from Step 3.
+This format makes training more efficient, especially with many nodes and GPUs. This step will also tokenize data using the tokenizer model from Step 3.

 *Option 1:* Using HuggingFace GPT2 tokenizer files.

@@ -106,15 +110,15 @@ Train GPT-style Model
 ~~~~~~~~~~~~~~~~~~~~~

 Once you have prepared training data and tokenizer, you are ready to train the model.
-The configuration we present below has about 124M parameters and it should fit on a single 16GB GPU if using float16.
+The configuration we present below has about 124M parameters and should fit on a single 16GB GPU using float16.

 Let's go!!!

 *Option 1:* Using HuggingFace GPT2 tokenizer files.

 .. code-block:: bash

-    python /home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
-    --config-path=/home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/conf \
+    python /examples/nlp/language_modeling/megatron_gpt_pretraining.py \
+    --config-path=/examples/nlp/language_modeling/conf \
     --config-name=megatron_gpt_config \
     trainer.devices=1 \
     trainer.num_nodes=1 \
@@ -166,8 +170,8 @@ Let's go!!!

 ..
code-block:: bash - python /home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - --config-path=/home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/conf \ + python /examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + --config-path=/examples/nlp/language_modeling/conf \ --config-name=megatron_gpt_config \ trainer.devices=1 \ trainer.num_nodes=1 \ @@ -215,7 +219,7 @@ Let's go!!! exp_manager.checkpoint_callback_params.always_save_nemo=False -Next, simply launch Tensorboard to monitor training like so: +Next, you can launch Tensorboard to monitor training like so: .. code-block:: bash From e46f71117011739603f38eacdbc3acd7e7904074 Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Mon, 11 Mar 2024 23:04:57 -0700 Subject: [PATCH 014/140] remove include intro from docs index (#8636) Signed-off-by: Elena Rastorgueva --- docs/source/index.rst | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 822431a9108a..9b62174ecbe2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,26 +1,44 @@ NVIDIA NeMo Framework Developer Docs ==================================== -.. include:: starthere/intro.rst +NVIDIA NeMo Framework is an end-to-end, cloud-native framework to build, customize, and deploy generative AI models anywhere. +`NVIDIA NeMo Framework `_ has separate collections for: + +* :doc:`Large Language Models (LLMs) ` + +* :doc:`Automatic Speech Recognition (ASR) ` + +* :doc:`Multimodal (MM) Models ` + +* :doc:`Text-to-Speech (TTS) ` + +* :doc:`Computer Vision (CV) ` + +Each collection consists of prebuilt modules that include everything needed to train on your data. +Every module can easily be customized, extended, and composed to create new generative AI +model architectures. + +For quick guides and tutorials, see the "Getting started" section below. -Index of NeMo Framework Developer Docs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. toctree:: :maxdepth: 2 :caption: Getting Started :name: starthere + :titlesonly: starthere/intro starthere/tutorials starthere/best-practices +For more information, browse the developer docs for your area of interest in the contents section below or on the left sidebar. .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: NeMo Core :name: core + :titlesonly: core/core core/exp_manager @@ -31,9 +49,10 @@ Index of NeMo Framework Developer Docs .. toctree:: - :maxdepth: 3 + :maxdepth: 1 :caption: Large Language Models (LLMs) :name: Large Language Models + :titlesonly: nlp/nemo_megatron/intro nlp/models @@ -43,9 +62,10 @@ Index of NeMo Framework Developer Docs .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Speech AI :name: Speech AI + :titlesonly: asr/intro asr/speech_classification/intro @@ -56,9 +76,10 @@ Index of NeMo Framework Developer Docs .. toctree:: - :maxdepth: 3 + :maxdepth: 1 :caption: Multimodal (MM) :name: Multimodal + :titlesonly: multimodal/mllm/intro multimodal/vlm/intro @@ -71,6 +92,7 @@ Index of NeMo Framework Developer Docs :maxdepth: 1 :caption: Text To Speech (TTS) :name: Text To Speech + :titlesonly: tts/intro @@ -78,6 +100,7 @@ Index of NeMo Framework Developer Docs :maxdepth: 2 :caption: Vision (CV) :name: vision + :titlesonly: vision/intro @@ -85,14 +108,16 @@ Index of NeMo Framework Developer Docs :maxdepth: 2 :caption: Common :name: Common + :titlesonly: common/intro .. 
toctree:: - :maxdepth: 3 + :maxdepth: 2 :caption: Speech Tools :name: Speech Tools + :titlesonly: tools/intro @@ -100,5 +125,6 @@ Index of NeMo Framework Developer Docs :maxdepth: 2 :caption: Upgrade Guide :name: Upgrade Guide + :titlesonly: starthere/migration-guide \ No newline at end of file From 6daf5e88e7016d537cede484cefbeecec2c391fe Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:31:05 -0400 Subject: [PATCH 015/140] Fix for relative file paths when presort_manifest==True (#8639) Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --- examples/asr/transcribe_speech.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 6d6006e939e5..e85a15be81d4 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -40,6 +40,7 @@ transcribe_partial_audio, write_transcription, ) +from nemo.collections.common.parts.preprocessing.manifest import get_full_path from nemo.core.config import hydra_runner from nemo.utils import logging @@ -331,6 +332,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis if cfg.presort_manifest: with NamedTemporaryFile("w", suffix=".json", delete=False) as f: for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=True): + item["audio_filepath"] = get_full_path(item["audio_filepath"], cfg.dataset_manifest) print(json.dumps(item), file=f) cfg.dataset_manifest = f.name remove_path_after_done = f.name From bab2a39467db2a489c5104af264f0d9575a6db6d Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Tue, 12 Mar 2024 14:50:06 -0700 Subject: [PATCH 016/140] Gemma uses openai_gelu approx (#8638) Signed-off-by: yaoyu-33 --- examples/nlp/language_modeling/conf/megatron_gemma_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gemma_config.yaml b/examples/nlp/language_modeling/conf/megatron_gemma_config.yaml index cda2162002d3..bdc5e2057886 100644 --- a/examples/nlp/language_modeling/conf/megatron_gemma_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gemma_config.yaml @@ -76,7 +76,7 @@ model: activation: 'geglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] - openai_gelu: False # Use OpenAI's GELU instead of the default GeLU + openai_gelu: True # Use OpenAI's GELU instead of the default GeLU normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope'] rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. 
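For context on the `openai_gelu` switch flipped above: per this commit, Gemma was trained with the tanh ("OpenAI"/GPT-2 style) approximation of GELU rather than the exact erf-based form, so weight-converted checkpoints should use the approximate variant. A small PyTorch sketch (illustrative only, not part of the patch) showing the two variants side by side:

```python
import math
import torch

def gelu_exact(x: torch.Tensor) -> torch.Tensor:
    # Exact GELU: x * Phi(x), where Phi is the standard normal CDF (erf form).
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh_approx(x: torch.Tensor) -> torch.Tensor:
    # Tanh approximation selected by openai_gelu=True in the config above.
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

x = torch.linspace(-4.0, 4.0, steps=9)
# The difference is small but nonzero, enough to shift logits of a converted checkpoint.
print((gelu_exact(x) - gelu_tanh_approx(x)).abs().max())
```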
From cb3f2bc748c5f4545a94f7aa32ec0e6576af9b7c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Wed, 13 Mar 2024 16:28:56 +0100 Subject: [PATCH 017/140] AMMO Integration with Llama2 Post-Training Quantization Example and Tests (#8444) * AMMO integration with Llama2 PTQ example and tests Signed-off-by: Jan Lasek * Jenkins megatron_llama_quantization.py test setup Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * License headers Signed-off-by: Jan Lasek * Add AMMO to requirements_nlp.txt with --extra-index-url for pip install Signed-off-by: Jan Lasek * Bump AMMO version to latest Signed-off-by: Jan Lasek * Guards workaround on spec definition Signed-off-by: Jan Lasek * Save artifacts and tokenizer config at once Signed-off-by: Jan Lasek * Extend nemo.utils package with new tools Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reorganize & reformat Signed-off-by: Jan Lasek * Tests for FP8 and INT4 AWQ Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add load_config helper function Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Unused import removal Signed-off-by: Jan Lasek * Fix FP8 Jenkins test Signed-off-by: Jan Lasek * Fix TP=2 test cont'd: no need to use mpirun Signed-off-by: Jan Lasek * Allow for patches in AMMO versioning Signed-off-by: Jan Lasek * Drop AWQ test for now (need to debug) Signed-off-by: Jan Lasek * Allow for patches in AMMO versioning cont'd Signed-off-by: Jan Lasek * Use AMMO spec from MCore as it has been published Signed-off-by: Jan Lasek * Make AMMO optional dependency and properly import guard it Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add Llama2 AWQ test and update some paths Signed-off-by: Jan Lasek * Enable specifying quantization.algorithm=null for baseline accuracy checks Signed-off-by: Jan Lasek * Enable exporting qnemo tarball or just to a directory Signed-off-by: Jan Lasek * Drop AWQ testing for now Signed-off-by: Jan Lasek * Test case for export.inference_tensor_parallel=2 Signed-off-by: Jan Lasek * Flag to export TRT-LLM config.json Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Dockerfile | 4 +- Jenkinsfile | 65 +++++- .../conf/megatron_llama_quantization.yaml | 38 +++ .../megatron_llama_quantization.py | 93 ++++++++ .../language_modeling/megatron_gpt_model.py | 2 + nemo/export/__init__.py | 13 ++ nemo/export/quantize/__init__.py | 15 ++ nemo/export/quantize/quantizer.py | 218 ++++++++++++++++++ nemo/utils/distributed.py | 23 ++ nemo/utils/model_utils.py | 49 ++++ tests/setup/__main__.py | 42 ++++ tests/setup/data/create_sample_jsonl.py | 58 +++++ tests/setup/models/create_hf_model.py | 94 ++++++++ 13 files changed, 710 insertions(+), 4 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml create mode 100644 examples/nlp/language_modeling/megatron_llama_quantization.py create mode 100644 nemo/export/__init__.py create mode 100644 nemo/export/quantize/__init__.py create mode 100644 nemo/export/quantize/quantizer.py create mode 100644 tests/setup/__main__.py create mode 100644 
tests/setup/data/create_sample_jsonl.py create mode 100644 tests/setup/models/create_hf_model.py diff --git a/Dockerfile b/Dockerfile index 90c84ea07627..970c34a690d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,7 +66,7 @@ WORKDIR /workspace/ # We leave it here in case we need to work off of a specific commit in main RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout ad53b1e38689a0ceed75ade7821f4e6c7554abb4 && \ + git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ pip install . # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771 @@ -132,6 +132,8 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec RUN pip install flash-attn # install numba for latest containers RUN pip install numba>=0.57.1 +# install ammo +RUN pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir # copy nemo source into a scratch image FROM scratch as nemo-src diff --git a/Jenkinsfile b/Jenkinsfile index cfd5853a6882..100a0bd4a6ad 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -91,11 +91,17 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 5f9c870f9f24b482509699d206a9dbb00958f6fc && \ + git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ pip install .' } } + stage('AMMO installation') { + steps { + sh 'pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir' + } + } + stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' @@ -390,6 +396,12 @@ pipeline { } } + stage('Setup test data and models') { + steps { + sh 'python -m tests.setup --save_dir /home/TestData/nlp' + } + } + // TODO: this requires TE >= v0.11 which is not available in 23.06. // please uncomment this test once mcore CI is ready. 
stage('L2: Community LLM Checkpoints tests') { @@ -405,9 +417,8 @@ pipeline { steps { sh 'CUDA_VISIBLE_DEVICES=0 python scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py \ --in-file=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --out-file=/home/TestData/nlp/megatron_llama/ci.nemo \ + --out-file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ --precision=16' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' } } stage('StarCoder') { @@ -439,6 +450,54 @@ pipeline { } } + stage('L2: Nemo PTQ') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('Llama2 - Export Only') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + quantization.algorithm=null \ + model_save=/home/TestData/nlp/megatron_llama/ci_baseline' + sh 'rm -rf /home/TestData/nlp/megatron_llama/ci_baseline' + } + } + stage('Llama2 - INT8 SQ') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int8_sq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' + } + } + stage('Llama2 - FP8') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=fp8 \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + export.inference_tensor_parallel=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' + } + } + } + } + stage('L2: ASR dev run') { when { anyOf { diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml new file mode 100644 index 000000000000..f3803dc4e69c --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -0,0 +1,38 @@ +inference: + greedy: false # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: true # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: false # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False + batch_size: 64 # batch size for inference + max_context_length: 512 # max length of the context, input sequence will be truncated if it is longer than this + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: false # logger provided by exp_manager + precision: bf16 # 16, 32, or bf16 + enable_checkpointing: false + +quantization: + quantize_bmm1: false + algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null + calib_dataset: cnn_dailymail # pileval, wikitext, cnn_dailymail + num_calib_size: 512 # number of samples used for calibration + +export: + decoder_type: llama # gptnext, gpt2, llama + inference_tensor_parallel: 1 # Default using 1 TP for inference + dtype: 16 # Default precision data type + export_tensorrt_llm_config: true # export config to build TRT-LLM engine directly + +model_file: llama2-7b-fp16.nemo # Nemo file path +model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py new file mode 100644 index 000000000000..16fb5ae9c13b --- /dev/null +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -0,0 +1,93 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.multiprocessing as mp +from datasets import load_dataset + +from nemo.core.config import hydra_runner +from nemo.export.quantize import Quantizer + +mp.set_start_method("spawn", force=True) + +""" +Nemo quantization example script. + +Please consult nemo.export.quantize.Quantizer class +and examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml config on available quantization methods, +models supported as well as how to set up data and inference for calibration (with defaults recommended). 
+ +Example usage: +``` +python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=llama2-7b-fp16.nemo \ + model_save=llama2-7b-fp8.qnemo \ + quantization.algorithm=fp8 \ + export.decoder_type=llama \ + export.inference_tensor_parallel=1 +``` +""" + + +def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512): + if data == "pileval": + dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train") + text_column = "text" + elif data == "wikitext": + dataset = load_dataset("wikitext", "wikitext-103-v1", split="train") + text_column = "text" + elif data == "cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + text_column = "article" + else: + # Assume a local JSON dataset with a column named "text" + dataset = load_dataset("json", data_files=data, split="train") + text_column = "text" + calib_size = max(min(len(dataset), calib_size), batch_size) + for i in range(calib_size // batch_size): + batch = dataset[i * batch_size : (i + 1) * batch_size][text_column] + for j in range(len(batch)): + batch[j] = batch[j][:max_sequence_length] + yield batch + + +@hydra_runner(config_path="conf", config_name="megatron_llama_quantization") +def main(cfg) -> None: + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for the inference.") + + quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer) + + # Quantization algorithm can be set to None. This is useful for baseline precision + # accuracy validation. In this case only weights export step will be performed: + if cfg.quantization.algorithm is not None: + dataloader = get_calib_dataloader( + cfg.quantization.calib_dataset, + cfg.inference.batch_size, + cfg.quantization.num_calib_size, + cfg.inference.max_context_length, + ) + dataloader = [data for data in dataloader] + else: + dataloader = None + + model = quantizer.quantize( + cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size + ) + + quantizer.export(model, cfg.model_save) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ac35af38de64..f883f1c1fc7c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -91,6 +91,7 @@ from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset + from megatron.core.deploy.gpt.model_specs import get_gpt_layer_ammo_spec from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func @@ -140,6 +141,7 @@ def get_specs(spec_name, num_experts=None): "": get_gpt_layer_with_transformer_engine_spec(num_experts), "megatron_falcon_gpt": get_falcon_layer_spec(), "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(), + "ammo": get_gpt_layer_ammo_spec(), } if spec_name not in name_spec_dict: raise ValueError(f"Spec name '{spec_name}' is not recognized.") diff --git a/nemo/export/__init__.py 
b/nemo/export/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/export/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py new file mode 100644 index 000000000000..87812e621bb6 --- /dev/null +++ b/nemo/export/quantize/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .quantizer import Quantizer diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py new file mode 100644 index 000000000000..1ae375e6cfe7 --- /dev/null +++ b/nemo/export/quantize/quantizer.py @@ -0,0 +1,218 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import tarfile +from contextlib import nullcontext +from typing import List, Optional + +import torch.distributed as dist +from megatron.core import parallel_state +from omegaconf import OmegaConf +from omegaconf.omegaconf import DictConfig, open_dict +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging +from nemo.utils.distributed import temporary_directory +from nemo.utils.get_rank import is_global_rank_zero +from nemo.utils.model_utils import load_config, save_artifacts + +try: + import ammo.torch.quantization as atq + from ammo.torch.export import export_model_config + + HAVE_AMMO = True + +except (ImportError, ModuleNotFoundError) as e: + HAVE_AMMO = False + HAVE_AMMO_ERROR = e + + +class Quantizer: + + """ + Post-training quantization of Nemo checkpoints. 
+ + PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving. + The process consist of several steps: + + 1. Loading a Nemo model from disk using appropriate parallelism strategy + 2. Calibrating the model to obtain appropriate algorithm-specific scaling factors + 3. Producing output directory or .qnemo tarball with model config (json), + quantized weights (safetensors) and tokenizer config (yaml). + + The output directory (or .qnemo file) produced is intended to be consumed by TensorRT-LLM toolbox + for efficient inference. This can be achieved using Nemo inference containers. + + Currently supported and tested model family is Llama2. Model type needs to be specified in + the quantization command with decoder_type parameter on exporting (see below). Quantizing other + model families is experimental and might not be fully supported. + + Available quantization methods are listed in QUANT_CFG_CHOICES dictionary below. + Please consult AMMO documentation for details. You can also inspect different choices in + examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml for quantization algorithms and + calibration data as well as recommended settings. + + Quantization algorithm can also be conveniently set to 'null' to perform only weights export step + for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model. + """ + + def __init__( + self, + quantization_config: DictConfig, + inference_config: DictConfig, + export_config: DictConfig, + trainer_config: DictConfig, + ): + if not HAVE_AMMO: + raise RuntimeError("nvidia-ammo>=0.7 is needed to use Quantizer") from HAVE_AMMO_ERROR + QUANT_CFG_CHOICES = { + "int8": atq.INT8_DEFAULT_CFG, + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, + "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, + } + SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers + assert export_config.dtype in SUPPORTED_DTYPE + assert quantization_config.algorithm is None or quantization_config.algorithm in QUANT_CFG_CHOICES + self.quantization_config = quantization_config + self.inference_config = inference_config + self.export_config = export_config + self.trainer_config = trainer_config + if quantization_config.algorithm is not None: + atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] + if quantization_config.algorithm != "fp8": + # disable quantization for the last output layer + atq_config = copy.deepcopy(atq_config) + atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} + self.atq_config = atq_config + else: + self.atq_config = None + + def _load_model( + self, + model_file: str, + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): + """Load model using AMMO layer spec for quantization.""" + model_cfg = self._load_and_modify_config(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) + + trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config) + connector = NLPSaveRestoreConnector() + + model = MegatronGPTModel.restore_from( + restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, + ) + model.freeze() + + try: + model.model.module.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + self._check_ddp_initialized(model) + + if is_global_rank_zero(): + print(model) + + return model + + def _check_ddp_initialized(self, 
model): + if parallel_state.is_unitialized(): + + def dummy(): + return + + if model.trainer.strategy.launcher is not None: + model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) + model.trainer.strategy.setup_environment() + + def _load_and_modify_config( + self, + model_file: str, + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): + model_cfg = load_config(model_file) + + with open_dict(model_cfg): + model_cfg.activations_checkpoint_method = None + model_cfg.activations_checkpoint_granularity = None + if tensor_model_parallel_size is not None: + model_cfg.tensor_model_parallel_size = tensor_model_parallel_size + if pipeline_model_parallel_size is not None: + model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size + # Only custom AMMO spec is supported for PTQ: this custom spec is largely based on local Megatron-LM + # layer definitions to avoid Transformer Engine implementations that are currently not supported. + model_cfg.name = "ammo" + + return model_cfg + + def quantize( + self, + model_file: str, + dataloader: Optional[List[List[str]]], + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): + """Quantize model checkpoint using given dataloader and optional custom parallelism settings.""" + model = self._load_model(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) + + if self.quantization_config.algorithm is None: + return model + + model.set_inference_config(OmegaConf.to_container(self.inference_config)) + + def forward_loop(): + for i, batch in enumerate(dataloader): + if is_global_rank_zero(): + print(f"Calibrating batch {i}") + model.predict_step(batch, i) + + model = atq.quantize(model, self.atq_config, forward_loop) + return model + + def export(self, model, model_save: str): + """Export model to '.qnemo' format for TensorRT-LLM engine build.""" + torch_dtype = torch_dtype_from_precision(self.export_config.dtype) + + # Setup model export handling: temporary directory for + # '.qnemo' tarball or directly write to model_save + save_qnemo = model_save.endswith(".qnemo") + if save_qnemo: + export_handler = temporary_directory() + else: + export_handler = nullcontext(enter_result=model_save) + + with export_handler as export_dir: + export_model_config( + model=model, + decoder_type=self.export_config.decoder_type, + dtype=torch_dtype, + export_dir=export_dir, + inference_tensor_parallel=self.export_config.inference_tensor_parallel, + export_tensorrt_llm_config=self.export_config.export_tensorrt_llm_config, + ) + dist.barrier() # Wait until all ranks complete export_model_config step + if is_global_rank_zero(): + logging.info(f"Exporting quantized weights, model artifacts, and tokenizer config to {model_save}...") + save_artifacts(model, export_dir) + if save_qnemo: + with tarfile.open(model_save, "w:gz") as tar: + tar.add(export_dir, arcname="./") diff --git a/nemo/utils/distributed.py b/nemo/utils/distributed.py index b0d24de3e5b4..ee6c107b1d85 100644 --- a/nemo/utils/distributed.py +++ b/nemo/utils/distributed.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib import os +import tempfile import torch +import torch.distributed as dist from nemo.utils import logging +from nemo.utils.get_rank import is_global_rank_zero try: from megatron.core import parallel_state @@ -100,3 +104,22 @@ def gather_objects(partial_results_list, main_rank=None): results_list.extend(r) return results_list + + +@contextlib.contextmanager +def temporary_directory(): + """Create a shared temporary directory across ranks in distributed setup. + + This function assumes that the distributed setup has been already + correctly initialized. It is intended to be used only in single-node + setup so that all ranks can access the directory created.""" + + if is_global_rank_zero(): + tmp_dir = [tempfile.TemporaryDirectory()] + else: + tmp_dir = [None] + dist.broadcast_object_list(tmp_dir) + yield tmp_dir[0].name + # We use barrier below to make sure that rank zero won't exit + # and delete tmp_dir while other ranks may still use it + dist.barrier() diff --git a/nemo/utils/model_utils.py b/nemo/utils/model_utils.py index b2a6abbf54aa..8889f13d5b98 100644 --- a/nemo/utils/model_utils.py +++ b/nemo/utils/model_utils.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import copy import importlib import os +import shutil +import tarfile +import tempfile from dataclasses import dataclass, is_dataclass from enum import Enum from functools import lru_cache @@ -61,6 +65,18 @@ class ArtifactItem: hashed_path: Optional[str] = None +def load_config(model_file: str) -> DictConfig: + """Load model config from extracted directory or '.nemo' tarball.""" + if os.path.isfile(model_file): + with tempfile.TemporaryDirectory() as tmp, tarfile.open(model_file, "r:") as tar: + tar.extract("./model_config.yaml", path=tmp) + model_config = OmegaConf.load(os.path.join(tmp, "model_config.yaml")) + else: + model_config = OmegaConf.load(os.path.join(model_file, "model_config.yaml")) + + return model_config + + def resolve_dataset_name_from_cfg(cfg: 'DictConfig') -> Optional[str]: """ Parses items of the provided sub-config to find the first potential key that @@ -636,3 +652,36 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: checkpoint_dir = filepath.with_name(filepath.stem) return checkpoint_dir + + +def save_artifacts(model, output_dir: str, use_abspath: bool = False) -> None: + """Save all model artifacts and tokenizer config to a given output directory.""" + app_state = AppState() + model_file = app_state.model_restore_path + model_cfg = copy.deepcopy(model.cfg) + + # Setup model file handling context: directory or tarball + if os.path.isfile(model_file): + model_file_handler = tarfile.open + kwargs = {"name": model_file, "mode": "r:"} + elif os.path.isdir(model_file): + model_file_handler = contextlib.nullcontext + kwargs = {} + else: + raise FileNotFoundError(model_file) + + # Copy or extract artifacts depending on the context + with model_file_handler(**kwargs) as maybe_tar: + for arti_name, arti_item in model.artifacts.items(): + _, arti_file = arti_item.path.split("nemo:") + arti_path = os.path.join(output_dir, arti_name) + if maybe_tar is not None: + maybe_tar.extract(f"./{arti_file}", path=output_dir) + os.rename(os.path.join(output_dir, arti_file), arti_path) + else: + shutil.copy(os.path.join(model_file, arti_file), arti_path) + # Store artifact path as basename by default. 
Otherwise save absolute path but bear in mind + # that in this case output directory should be permanent for correct artifact recovery later + arti_path = os.path.abspath(arti_path) if use_abspath else os.path.basename(arti_path) + OmegaConf.update(model_cfg, arti_name, arti_path) + OmegaConf.save(model_cfg.tokenizer, os.path.join(output_dir, "tokenizer_config.yaml")) diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py new file mode 100644 index 000000000000..289a2537e2f2 --- /dev/null +++ b/tests/setup/__main__.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +from .data.create_sample_jsonl import create_sample_jsonl +from .models.create_hf_model import create_hf_model + +print("Setup test data and models...") + +parser = argparse.ArgumentParser("Setup test data and models.") +parser.add_argument("--save_dir", required=True, help="Root save directory for artifacts") +parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files and directories") +args = parser.parse_args() + +print(f"Arguments are: {vars(args)}") + +os.makedirs(args.save_dir, exist_ok=True) + +create_sample_jsonl( + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), overwrite=args.overwrite, +) + +create_hf_model( + model_name_or_path="/home/TestData/nlp/meta-llama/Llama-2-7b-hf", + output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf"), + config_updates={"hidden_size": 256, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, + overwrite=args.overwrite, +) +print("Setup done.") diff --git a/tests/setup/data/create_sample_jsonl.py b/tests/setup/data/create_sample_jsonl.py new file mode 100644 index 000000000000..00f789548f81 --- /dev/null +++ b/tests/setup/data/create_sample_jsonl.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os + +""" +Create sample JSONL file for functional testing. Each line contains a dictionary +with a single element "text" for storing data. 
+""" + + +def create_sample_jsonl(output_file: str, overwrite: bool = False): + """Create sample JSONL.""" + if os.path.isfile(output_file) and not overwrite: + print(f"File {output_file} exists and overwrite flag is not set so exiting.") + return + + texts = [ + "Sample data for functional tests", + "Once upon a time, in the middle of a dense forest, there was a small house, where lived a pretty little girl " + "named Little Red Riding Hood.", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore " + "magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea " + "commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat " + "nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit " + "anim id est laborum...", + "Next please!", + "¡H E L L O W O R L D!", + "Korzystając z okazji chciałbym pozdrowić całą moją rodzinę i przyjaciół", + ] + print(f"Writing {len(texts)} line(s) to {output_file}...") + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, mode="w", encoding="utf-8") as f: + for text in texts: + json.dump({"text": text}, f) + f.write("\n") + print("OK.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Create sample JSONL file.") + parser.add_argument("--output_file", help="Output file name") + parser.add_argument("--overwrite", action="store_true", help="Overwrite file if it exists") + args = parser.parse_args() + create_sample_jsonl(args.output_file) diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py new file mode 100644 index 000000000000..9f57d5996dfc --- /dev/null +++ b/tests/setup/models/create_hf_model.py @@ -0,0 +1,94 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os + +from typing import Any, Dict, Optional + +import transformers + +""" +Create a randomly initialized HuggingFace model for testing purposes. + +Model can be specified by name or path for creating its config and tokenizer using +HuggingFace transformers AutoConfig and AutoTokenizer functions. + +Parameter config_updates can be used to override specific model config fields to make +it smaller, for example, by changing number of layers or hidden layers dimensionality, +making it adequate for testing purposes. This parameter should be specified as +a dictionary that can be parsed using json.loads method. 
+ +Example usage for Llama2 model (requires HF login): +``` +python tests/setup/models/create_tiny_hf_model.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --output_dir tiny_llama2_hf \ + --config_updates '{"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}' +``` +""" + + +def get_hf_model_class(hf_config): + """Get HuggingFace model class from config.""" + if len(hf_config.architectures) > 1: + print(f"More than one model architecture available, choosing 1st: {hf_config.architectures}") + model_name = hf_config.architectures[0] + model_class = getattr(transformers, model_name) + return model_class + + +def create_hf_model( + model_name_or_path: str, output_dir: str, config_updates: Optional[Dict[str, Any]] = None, overwrite: bool = False +): + """Create HuggingFace model with optional config updates.""" + if os.path.isdir(output_dir) and not overwrite: + print(f"Output directory {output_dir} exists and overwrite flag is not set so exiting.") + return + + hf_config = transformers.AutoConfig.from_pretrained(model_name_or_path) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path) + model_class = get_hf_model_class(hf_config) + + if config_updates is not None: + hf_config.update(config_updates) + print(hf_config) + + model = model_class(hf_config) + print(model) + + os.makedirs(output_dir, exist_ok=True) + print(f"Saving model to {output_dir}...") + tokenizer.save_pretrained(output_dir) + model.save_pretrained(output_dir) + print("OK.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") + parser.add_argument( + "--model_name_or_path", required=True, help="Model name or local path with model config and tokenizer", + ) + parser.add_argument( + "--output_dir", required=True, help="Output directory", + ) + parser.add_argument( + "--config_updates", type=json.loads, help="Parameter updates in JSON format to overwrite for model config", + ) + parser.add_argument( + "--overwrite", action="store_true", help="Overwrite file if it exists", + ) + args = parser.parse_args() + create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From fba71c0977a747444de3e08f0e38d812128ddf00 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Wed, 13 Mar 2024 14:14:36 -0400 Subject: [PATCH 018/140] fix FIM RNG issue (#8513) * fix FIM RNG issue * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix FIMDataset * fix seed ref * fim fix Signed-off-by: dimapihtar * add fim test Signed-off-by: dimapihtar * remove files Signed-off-by: dimapihtar * remove swp Signed-off-by: dimapihtar * remove import Signed-off-by: dimapihtar * fix syntax Signed-off-by: dimapihtar * fix Jenkins Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar --- Jenkinsfile | 55 ++++++++++++++++++- .../megatron/gpt_fim_dataset.py | 9 +-- .../language_modeling/megatron_gpt_model.py | 2 +- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 100a0bd4a6ad..602c78890262 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -5273,7 +5273,60 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } } - + stage('L2: Megatron FIM Dataset') { + when { + anyOf { + branch 'main' + 
changeRequest target: 'main' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=1 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=layernorm1p \ + model.bias_activation_fusion=True \ + model.bias_dropout_add_fusion=True \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=null \ + model.activations_checkpoint_granularity=null \ + model.activations_checkpoint_num_layers=null \ + model.data.data_prefix='[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]' \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + ++model.data.add_fim=True \ + ++model.data.fim.extra_tokens.prefix='fim_prefix' \ + ++model.data.fim.extra_tokens.middle='fim_middle' \ + ++model.data.fim.extra_tokens.suffix='fim_suffix' \ + ++model.data.fim.extra_tokens.pad='fim_pad' \ + ++model.data.fim.extra_tokens.eod='endoftext'" + sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" + } + } + stage('L2: Megatron Mock Data Generation') { when { anyOf { diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py index 49a34a368fdc..17576bea4c75 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py @@ -29,13 +29,11 @@ class GPTFIMDatasetConfig(GPTDatasetConfig): """Configuration object for Megatron Core GPT FIM datasets Attributes: - tokenizer: model tokenizer fim: fill in the middle parameters config """ - def __init__(self, tokenizer, fim, **kwargs): + def __init__(self, fim, **kwargs): super().__init__(**kwargs) - self.tokenizer = tokenizer self.fim = fim @@ -58,12 +56,15 @@ class GPTFIMDataset(GPTDataset): def __init__( self, indexed_dataset: MMapIndexedDataset, + dataset_path: str, indexed_indices: np.ndarray, num_samples: int, index_split: Split, config: GPTFIMDatasetConfig, ) -> None: - super().__init__(indexed_dataset, indexed_indices, num_samples, index_split, config) + super().__init__(indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config) + + self.indexed_dataset = indexed_dataset def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[np.ndarray, np.ndarray]: """Get the text (token ids) and document ids for a given index diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py 
b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f883f1c1fc7c..79d48269d3a6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1286,7 +1286,7 @@ def build_train_valid_test_datasets(self): kwargs["split"] = self.cfg.data.splits_string if self.cfg.data.get('add_fim', False): - dataset_config = GPTFIMDatasetConfig(self.tokenizer, self.cfg.data.fim, **kwargs) + dataset_config = GPTFIMDatasetConfig(self.cfg.data.fim, **kwargs) self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( GPTFIMDataset, train_valid_test_num_samples, dataset_config, From 48b8204d57e59c8790aaa6eaa20384b046b1a574 Mon Sep 17 00:00:00 2001 From: Aditya Malte Date: Wed, 13 Mar 2024 15:43:58 -0700 Subject: [PATCH 019/140] Add support to perform "inference-only" without loading training data (#8640) * Add support to perform "inference-only" without loading training data Hi, Currently, the MegatronSBERT model cannot run inference. Essentially, a user may not be able to simply load a trained .nemo checkpoint and run inference (forward()) function on it. This patch adds a try/except block to handle cases where training data is not specified Signed-off-by: Aditya Malte * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Aditya Malte Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../megatron_sbert_model.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py index 0d312845db58..a9bb7fd40017 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py @@ -391,15 +391,23 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.cross_entropy_loss = torch.nn.CrossEntropyLoss(label_smoothing=cfg.get('label_smoothing', 0.0)) softmax_temp = cfg.get('softmax_temp', 0.05) self.scale = 1.0 / softmax_temp - train_file_path = self.cfg.data.data_prefix - with open(train_file_path) as f: - train_data = json.load(f) - - random_seed = 42 - set_seed(random_seed) - random.shuffle(train_data) - - self.train_data = train_data + try: + train_file_path = self.cfg.data.data_prefix + with open(train_file_path) as f: + train_data = json.load(f) + + random_seed = 42 + set_seed(random_seed) + random.shuffle(train_data) + + self.train_data = train_data + logging.warning("Model is running in training mode") + except: + logging.warning( + "Model is running inference mode as training data is not specified, or could not be loaded" + ) + random_seed = 42 + set_seed(random_seed) def model_provider_func(self, pre_process, post_process): cfg = self.cfg From 1baaff7567508127dd3778eed77e185b8cf10dc0 Mon Sep 17 00:00:00 2001 From: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Date: Thu, 14 Mar 2024 19:32:48 +0400 Subject: [PATCH 020/140] Add ASR context-biasing tutorial (#8462) * add ctcws tutorial Signed-off-by: andrusenkoau * clear sell outputs Signed-off-by: andrusenkoau * fixes Signed-off-by: andrusenkoau * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: andrusenkoau * fixes Signed-off-by: 
andrusenkoau * fixes Signed-off-by: andrusenkoau --------- Signed-off-by: andrusenkoau Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../context_biasing/context_graph_ctc.py | 22 +- tutorials/asr/ASR_Context_Biasing.ipynb | 947 ++++++++++++++++++ tutorials/asr/README.md | 1 + 3 files changed, 963 insertions(+), 7 deletions(-) create mode 100644 tutorials/asr/ASR_Context_Biasing.ipynb diff --git a/nemo/collections/asr/parts/context_biasing/context_graph_ctc.py b/nemo/collections/asr/parts/context_biasing/context_graph_ctc.py index 5c9c3924625d..bcfcdf2435f1 100644 --- a/nemo/collections/asr/parts/context_biasing/context_graph_ctc.py +++ b/nemo/collections/asr/parts/context_biasing/context_graph_ctc.py @@ -179,22 +179,26 @@ def draw(self, title: Optional[str] = None, symbol_table: Optional[Dict[int, str "size": "8.5,11", "center": "1", "orientation": "Portrait", - "ranksep": "0.4", + "ranksep": "0.30", "nodesep": "0.25", } if title is not None: graph_attr["label"] = title + default_edge_attr = { + "fontsize": "12", + } + default_node_attr = { "shape": "circle", "style": "bold", - "fontsize": "14", + "fontsize": "12", } final_state_attr = { "shape": "doublecircle", "style": "bold", - "fontsize": "14", + "fontsize": "12", } dot = graphviz.Digraph(name="Context Graph", graph_attr=graph_attr) @@ -221,14 +225,18 @@ def draw(self, title: Optional[str] = None, symbol_table: Optional[Dict[int, str if node.index != current_node.index: output, input, arc = str(current_node.index), str(node.index), f"{label}" if (output, input, arc) not in printed_arcs: - dot.edge(output, input, label=arc) + if arc == self.blank_token: + dot.edge(output, input, label=self.blank_token, color="blue", **default_edge_attr) + else: + dot.edge(output, input, label=arc) queue.append(node) else: output, input, arc = str(current_node.index), str(current_node.index), f"{label}" if (output, input, arc) not in printed_arcs: - dot.edge( - output, input, label=arc, color="green", - ) + if arc == self.blank_token: + dot.edge(output, input, label=self.blank_token, color="blue", **default_edge_attr) + else: + dot.edge(output, input, label=arc, color="green") printed_arcs.add((output, input, arc)) return dot diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb new file mode 100644 index 000000000000..f001ce3d65a2 --- /dev/null +++ b/tutorials/asr/ASR_Context_Biasing.ipynb @@ -0,0 +1,947 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "17b3cbf8", + "metadata": {}, + "source": [ + "# Context-Biasing for ASR models with CTC-based Word Spotter" + ] + }, + { + "cell_type": "markdown", + "id": "1156d1d1", + "metadata": {}, + "source": [ + "This tutorial aims to show how to improve the recognition accuracy of specific words in NeMo framework\n", + "for CTC and Trasducer (RNN-T) ASR models by using the fast context-biasing method with CTC-based Word Spotter.\n", + "\n", + "## Tutorial content:\n", + "* Intro in the context-biasing problem\n", + "* Description of the proposed CTC-based Words Spotter (CTC-WS) method\n", + "* Practical part 1 (base):\n", + " * Download data set and ASR models\n", + " * Build context-biasing list\n", + " * Evaluate recognition results with and without context-biasing\n", + " * Improve context-biasing results with alternative transcriptions\n", + "* Practical part 2 (advanced):\n", + " * Visualization of context-biasing graph\n", + " * Running CTC-based Word Spotter only\n", + " * Merge greedy decoding results with 
spotted context-biasing candidates\n", + " * Results analysis\n", + "* Summary" + ] + }, + { + "cell_type": "markdown", + "id": "431edfbf", + "metadata": {}, + "source": [ + "## Context-biasing: intro\n", + "\n", + "ASR models often struggle to recognize words that were absent or had few examples in the training data.\n", + "This problem is especially acute due to the emergence of new names and titles in a rapidly developing world.\n", + "Users need to be able to recognize these new words.\n", + "Context-biasing methods attempt to solve this problem by assuming that we have a list of words and phrases (context-biasing list) in advance\n", + "for which we want to improve recognition accuracy.\n", + "\n", + "One of the directions of context-biasing methods is based on the `deep fusion` approach.\n", + "These methods require intervention into the ASR model and its training process.\n", + "The main disadvantage of these methods is that they require a lot of computational resources and time to train the model.\n", + "\n", + "Another direction consists of methods based on the `shallow fusion` approach. In this case, only the decoding process is modified.\n", + "During the beam-search decoding, the hypothesis is rescored depending on whether the current word is present in the context-biasing list or an external language model.\n", + "The beam-search decoding may be computationally expensive, especially for the models with a large vocabulary and context-biasing list.\n", + "This problem is considerably worsened in the case of the Transducer (RNN-T) model since beam-search decoding involves multiple Decoder (Prediction) and Joint networks calculations.\n", + "Moreover, the context-biasing recognition is limited by the model prediction pool biased toward training data. In the case of rare or new words, the model may not have a hypothesis for the desired word from the context-biasing list whose probability we want to amplify." + ] + }, + { + "cell_type": "markdown", + "id": "ae0bfd60", + "metadata": {}, + "source": [ + "## CTC-based Word Spotter\n", + "\n", + "\n", + "This tutorial considers a fast context-biasing method using a CTC-based Word Spotter (CTC-WS).\n", + "The method involves decoding CTC log probabilities with a context graph built for words and phrases from the context-biasing list.\n", + "The spotted context-biasing candidates (with their scores and time intervals) are compared by scores with words from the greedy\n", + "CTC decoding results to improve recognition accuracy and prevent false accepts of context-biasing (Figure 1). \n", + " \n", + " \n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c7e45bf2", + "metadata": {}, + "source": [ + "
\n", + " \"CTC-WS\" \n", + "
Figure 1. High-level representation of the proposed context-biasing method with CTC-WS in the case of a CTC model. Detected words (gpu, nvidia, cuda) are compared with words from the greedy CTC results in the overlapping intervals according to the accumulated scores to prevent false-accept replacements.
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "ba163f41", + "metadata": {}, + "source": [ + "\n", + "\n", + " \n", + "A [Hybrid Transducer-CTC](https://arxiv.org/abs/2312.17279) model (a shared encoder trained together with CTC and Transducer output heads) enables the use of the CTC-WS method for the Transducer model.\n", + "Context-biasing candidates obtained by CTC-WS are also filtered by the scores with greedy CTC predictions and then merged with greedy Transducer results.\n", + "\n", + "The CTC-WS method allows using pretrained NeMo models (`CTC` or `Hybrid Transducer-CTC`) for context-biasing recognition without model retraining (Figure 2).\n", + "The method shows inspired results for context-biasing with only a little additional work time and computational resources.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c05b16d8", + "metadata": {}, + "source": [ + "
\n", + " \"CTC-WS\" \n", + "
Figure 2. Scheme of the context-biasing method with CTC-based Word Spotter. CTC-WS uses CTC log probabilities to detect context-biasing candidates. Obtained candidates are filtered by CTC word alignment and then merged with CTC or RNN-T word alignment to get the final text result.
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "ac0ec822", + "metadata": {}, + "source": [ + "# Installing dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69c86a4f", + "metadata": {}, + "outputs": [], + "source": [ + "BRANCH = 'main'\n", + "\n", + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "\n", + "import os\n", + "# either provide a path to local NeMo repository with NeMo already installed or git clone\n", + "\n", + "# option #1: local path to NeMo repo with NeMo already installed\n", + "NEMO_DIR_PATH = os.path.dirname(os.path.dirname(os.path.abspath('')))\n", + "\n", + "# check if Google Colab is being used\n", + "try:\n", + " import google.colab\n", + " IN_COLAB = True\n", + "except (ImportError, ModuleNotFoundError):\n", + " IN_COLAB = False\n", + "\n", + "# option #2: download NeMo repo\n", + "if IN_COLAB or not os.path.exists(os.path.join(NEMO_DIR_PATH, \"nemo\")):\n", + " ## Install dependencies\n", + " !apt-get install sox libsndfile1 ffmpeg\n", + "\n", + " !git clone -b $BRANCH https://github.com/NVIDIA/NeMo\n", + " %cd NeMo\n", + " !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + " NEMO_DIR_PATH = os.path.abspath('')\n", + "\n", + "import sys\n", + "sys.path.insert(0, NEMO_DIR_PATH)" + ] + }, + { + "cell_type": "markdown", + "id": "5260d4fa", + "metadata": {}, + "source": [ + "## Practical part 1 (base)\n", + "In this part, we will consider the base usage of the CTC-WS method for pretrained NeMo models.\n", + "\n", + "### Data preparation.\n", + "We will use a subset of the GTC data set. The data set contains 10 audio files with NVIDIA GTC talks. \n", + "The primary data set feature is the computer science and engineering domain, which has a large number of unique terms and product names (NVIDIA, GPU, GeForce, Ray Tracing, Omniverse, teraflops, etc.), which is good fit for the context-biasing task. All the text data is normalized and lowercased." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "637f2c6d", + "metadata": {}, + "outputs": [], + "source": [ + "# download data\n", + "!wget https://asr-tutorial-data.s3.eu-north-1.amazonaws.com/context_biasing_data.gz\n", + "!tar -xvzf context_biasing_data.gz\n", + "!apt-get install tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6baefc80", + "metadata": {}, + "outputs": [], + "source": [ + "!tree context_biasing_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09fe748b", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n", + "\n", + "# data is already stored in nemo data manifest format\n", + "test_nemo_manifest = \"./context_biasing_data/gtc_data_subset_10f.json\"\n", + "test_data = read_manifest(test_nemo_manifest)\n", + "\n", + "for idx, item in enumerate(test_data):\n", + " print(f\"[{idx}]: {item['text']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64ab4764", + "metadata": {}, + "outputs": [], + "source": [ + "import librosa\n", + "import IPython.display as ipd\n", + "\n", + "# load and listen to the audio file example\n", + "example_file = test_data[0]['audio_filepath']\n", + "audio, sample_rate = librosa.load(example_file)\n", + "\n", + "file_id = 0\n", + "print(f\"[TEXT {file_id}]: {test_data[file_id]['text']}\\n\")\n", + "ipd.Audio(example_file, rate=sample_rate)" + ] + }, + { + "cell_type": "markdown", + "id": "a85ea8ec", + "metadata": {}, + "source": [ + "### Load ASR models\n", + "\n", + "For testing the CTC-WS method, we will use the following NeMo models:\n", + " - (CTC): [stt_en_fastconformer_ctc_large](https://huggingface.co/nvidia/stt_en_fastconformer_ctc_large) - a large fast-conformer model trained on English ASR data\n", + " - (Hybrid Transducer-CTC): [stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi) - a large fast-conformer model trained jointly with CTC and Transducer heads on English ASR data. The model is streaming, which means it can process audio in real time. It can cause a slight WER degradation in comparison with the first offline model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d34ee0ba", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "from nemo.collections.asr.models import EncDecCTCModelBPE, EncDecHybridRNNTCTCBPEModel\n", + "\n", + "# ctc model\n", + "ctc_model_name = \"stt_en_fastconformer_ctc_large\"\n", + "ctc_model = EncDecCTCModelBPE.from_pretrained(model_name=ctc_model_name)\n", + "\n", + "# hybrid transducer-ctc model\n", + "hybrid_ctc_rnnt_model_name = \"stt_en_fastconformer_hybrid_large_streaming_multi\"" + ] + }, + { + "cell_type": "markdown", + "id": "082208cd", + "metadata": {}, + "source": [ + "### Transcribe \n", + "Let's transcribe the test data and analyze the recognition accuracy of specific words." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74436885", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "test_audio_files = [item['audio_filepath'] for item in test_data]\n", + "recog_results = ctc_model.transcribe(test_audio_files)" + ] + }, + { + "cell_type": "markdown", + "id": "b993d650", + "metadata": {}, + "source": [ + "### Compute per-word recognition statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70f5714b", + "metadata": {}, + "outputs": [], + "source": [ + "import texterrors\n", + "\n", + "word_dict = {} # {word: [num_of_occurrences, num_of_correct_recognition]}\n", + "eps = \"<eps>\" # gap (epsilon) symbol used in the texterrors alignment\n", + "ref_text = [item['text'] for item in test_data]\n", + "\n", + "for idx, ref in enumerate(ref_text):\n", + "    ref = ref.split()\n", + "    hyp = recog_results[idx].split()\n", + "    texterrors_ali = texterrors.align_texts(ref, hyp, False)\n", + "    ali = []\n", + "    for i in range(len(texterrors_ali[0])):\n", + "        ali.append((texterrors_ali[0][i], texterrors_ali[1][i]))\n", + "\n", + "    for pair in ali:\n", + "        word_ref, word_hyp = pair\n", + "        if word_ref == eps:\n", + "            continue\n", + "        if word_ref in word_dict:\n", + "            word_dict[word_ref][0] += 1\n", + "        else:\n", + "            word_dict[word_ref] = [1, 0]\n", + "        if word_ref == word_hyp:\n", + "            word_dict[word_ref][1] += 1\n", + "\n", + "word_candidats = {}\n", + "\n", + "for word in word_dict:\n", + "    gt = word_dict[word][0]\n", + "    tp = word_dict[word][1]\n", + "    if tp/gt < 1.0:\n", + "        word_candidats[word] = [gt, round(tp/gt, 2)]\n", + " \n", + "# print obtained per-word statistic\n", + "word_candidats_sorted = sorted(word_candidats.items(), key=lambda x:x[1][0], reverse=True)\n", + "max_word_len = max([len(x[0]) for x in word_candidats_sorted])\n", + "for item in word_candidats_sorted:\n", + "    print(f\"{item[0]:<{max_word_len}} {item[1][0]}/{item[1][1]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "27a9f88b", + "metadata": {}, + "source": [ + "## Create a context-biasing list\n", + "\n", + "Now, we need to select the words whose recognition we want to improve by CTC-WS context-biasing.\n", + "Usually, we select only nontrivial words with the lowest recognition accuracy.\n", + "Such words should have a character length >= 3 because short words in a context-biasing list may produce high false-positive recognition.\n", + "In this toy example, we will select all the words that look like names with a recognition accuracy less than 1.0.\n", + "\n", + "The structure of the context-biasing file is:\n", + "\n", + "WORD1_TRANSCRIPTION1 \n", + "WORD2_TRANSCRIPTION1 \n", + "...\n", + "\n", + "TRANSCRIPTION here is a word spelling. 
We need this structure to add alternative transcriptions (spellings) for some word. We will cover such a case further." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c27848f0", + "metadata": {}, + "outputs": [], + "source": [ + "cb_words = [\"gpu\", \"nvidia\", \"nvidia's\", \"nvlink\", \"omniverse\", \"cunumeric\", \"numpy\", \"dgx\", \"dgxs\", \"dlss\",\n", + " \"cpu\", \"tsmc\", \"culitho\", \"xlabs\", \"tensorrt\", \"tensorflow\", \"pytorch\", \"aws\", \"chatgpt\", \"pcie\"]\n", + "\n", + "# write context-biasing file \n", + "cb_list_file = \"context_biasing_data/context_biasing_list.txt\"\n", + "with open(cb_list_file, \"w\", encoding=\"utf-8\") as fn:\n", + " for word in cb_words:\n", + " fn.write(f\"{word}_{word}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0e8c800", + "metadata": {}, + "outputs": [], + "source": [ + "!cat {cb_list_file}" + ] + }, + { + "cell_type": "markdown", + "id": "c44fc910", + "metadata": {}, + "source": [ + "## Run context-biasing evaluation\n", + "\n", + "The main script for CTC-WS context-biasing in NeMo is:\\\n", + "`{NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py`\n", + "\n", + "Context-biasing is managed by `apply_context_biasing` parameter [true or false]. \n", + "Other important context-biasing parameters are:\n", + "- `beam_threshold` - threshold for CTC-WS beam pruning\n", + "- `context_score` - per token weight for context biasing\n", + "- `ctc_ali_token_weight` - per token weight for CTC alignment (prevents false acceptances of context-biasing words) \n", + "\n", + "All the context-biasing parameters are selected according to the default values in the script. \n", + "You can tune them according to your data and ASR model (list all the values in the [] separated by commas) \n", + "for example: `beam_threshold=[7.0,8.0,9.0]`, `context_score=[3.0,4.0,5.0]`, `ctc_ali_token_weight=[0.5,0.6,0.7]`. \n", + "The script will run the recognition with all the combinations of the parameters and will select the best one based on WER value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a2d32e9", + "metadata": {}, + "outputs": [], + "source": [ + "# create directory with experimental results\n", + "import os\n", + "\n", + "exp_dir = \"exp\"\n", + "if not os.path.isdir(exp_dir):\n", + " os.makedirs(exp_dir)\n", + "else:\n", + " print(f\"Directory '{exp_dir}' already exists\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "116f2abe", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ctc model (no context-biasing)\n", + "\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={ctc_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"ctc\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=false \\\n", + " context_file={cb_list_file} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "markdown", + "id": "674d0af1", + "metadata": {}, + "source": [ + "The results must be:\n", + "\n", + "`Precision`: 1.0000 (1/1) fp:0 (fp - false positive recognition) \n", + "`Recall`: 0.0333 (1/30) \n", + "`Fscore`: 0.0645 \n", + "`Greedy WER/CER` = 35.68%/8.16%\n", + "\n", + "The model could recognize 1 out of 30 words from the context-biasing list.\n", + "Let's enable context-biasing during decoding:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "239da41d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ctc model (with context biasing)\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={ctc_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"ctc\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=true \\\n", + " context_file={cb_list_file} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "markdown", + "id": "faa1e73c", + "metadata": {}, + "source": [ + "Now, recognition results are much better:\n", + "\n", + "`Precision`: 1.0000 (21/21) fp:0 \n", + "`Recall`: 0.7000 (21/30) \n", + "`Fscore`: 0.8235 \n", + "`Greedy WER/CER` = 17.09%/4.43%\n", + "\n", + "But we are still able to recognize only 21 out of 30 specific words.\\\n", + "You can see that unrecognized words are mostly abbreviations (`dgxs`, `dlss`, `gpu`, `aws`, etc.) or compound words (`culitho`).\\\n", + "The ASR models tends to recognize such words as a sequence of characters (`\"aws\" -> \"a w s\"`) or subwords (`\"culitho\" -> \"cu litho\"`).\\\n", + "We can try to improve the recognition of such words by adding alternative transcriptions to the context-biasing list." + ] + }, + { + "cell_type": "markdown", + "id": "d72b6391", + "metadata": {}, + "source": [ + "### Alternative transcriptions\n", + "\n", + "wordninja is used to split compound words into simple words according to the default word dictionary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7e00263", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install wordninja" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46fe91e9", + "metadata": {}, + "outputs": [], + "source": [ + "import wordninja\n", + "\n", + "cb_list_file_modified = cb_list_file + \".abbr_and_ninja\"\n", + "\n", + "with open(cb_list_file, \"r\", encoding=\"utf-8\") as fn1, \\\n", + " open(cb_list_file_modified, \"w\", encoding=\"utf-8\") as fn2:\n", + "\n", + " for line in fn1:\n", + " word = line.strip().split(\"_\")[0]\n", + " new_line = f\"{word}_{word}\"\n", + " # split all the short words into characters\n", + " if len(word) <= 4 and len(word.split()) == 1:\n", + " abbr = ' '.join(list(word))\n", + " new_line += f\"_{abbr}\"\n", + " # split the long words into the simple words (not for phrases)\n", + " new_segmentation = wordninja.split(word)\n", + " if word != new_segmentation[0]:\n", + " new_segmentation = ' '.join(new_segmentation)\n", + " new_line += f\"_{new_segmentation}\"\n", + " fn2.write(f\"{new_line}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da69da45", + "metadata": {}, + "outputs": [], + "source": [ + "!cat {cb_list_file_modified}" + ] + }, + { + "cell_type": "markdown", + "id": "4a21cbf4", + "metadata": {}, + "source": [ + "Run context-biasing with modified context-biasing list:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "913a0f5e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ctc models (with context biasing and modified cb list)\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={ctc_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"ctc\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=true \\\n", + " context_file={cb_list_file_modified} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "markdown", + "id": "654751ed", + "metadata": {}, + "source": [ + "Now, the recognition results are:\n", + "\n", + "`Precision`: 1.0000 (28/28) fp:1 \n", + "`Recall`: 0.9333 (28/30) \n", + "`Fscore`: 0.9655 \n", + "`Greedy WER/CER` = 7.04%/2.93%\n", + "\n", + "As you can see, that adding alternative transcriptions to the cb_list file improved the recognition accuracy of the context-biasing words. However, we still miss 2 words. Unfortunately, this algorithm is not a silver bullet.\n", + "\n", + "In some cases, you can improve results by adding alternative transcriptions manually based on the recognition errors of your ASR model for the specific words (for example, `\"nvidia\" -> \"n video\"`). " + ] + }, + { + "cell_type": "markdown", + "id": "b96c4023", + "metadata": {}, + "source": [ + "### Hybrid Transducer-CTC model\n", + "The CTC-WS context-biasing method for Transducer (RNN-T) models is supported only for Hybrid Transducer-CTC model. \n", + "To use Transducer head of the Hybrid Transducer-CTC model, we need to set `decoder_type=\"rnnt\"`. \n", + "Other parameters are the same as for the CTC model because the context-biasing is applied only on the CTC part of the model. 
Spotted context-biasing words will be merged with the greedy decoding results of the Transducer head.\n", + "\n", + "We can use the already prepared context-biasing list because the CTC and Hybrid Transducer-CTC models have almost the same BPE tokenizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "456e47df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Transducer model (no context-biasing)\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={hybrid_ctc_rnnt_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"rnnt\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=false \\\n", + " context_file={cb_list_file_modified} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "773e11f1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Transducer model (with context-biasing)\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={hybrid_ctc_rnnt_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"rnnt\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=true \\\n", + " context_file={cb_list_file_modified} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "markdown", + "id": "45a91385", + "metadata": {}, + "source": [ + "CTC-WS context-biasing works for the Transducer model as well as for CTC (`F-score improvement: 0.3784 -> 0.9286`). Differences in the nature of offline and online models may cause differences in results (usually, online models have a tendency to predict tokens earlier, which can affect the difference between the timestamps of CTC and RNN-T models). " + ] + }, + { + "cell_type": "markdown", + "id": "1968e7bc", + "metadata": {}, + "source": [ + "## Practical part 2 (advanced)\n", + "In this section, we will consider the context-biasing process more deeply:\n", + "- Visualization of the context-biasing graph\n", + "- Running CTC-WS with the context-biasing graph\n", + "- Merging the obtained spotted words with greedy decoding results\n", + "- Analysis of the results" + ] + }, + { + "cell_type": "markdown", + "id": "277104b5", + "metadata": {}, + "source": [ + "### Build a context graph (for visualization only)\n", + "The context graph is a composition of a prefix tree (Trie) with the CTC transition topology for words and phrases from the context-biasing list. We use a BPE tokenizer from the target ASR model for word segmentation."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "904ea41b", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.asr.parts import context_biasing\n", + "\n", + "# get bpe tokenization\n", + "cb_words_small = [\"nvidia\", \"gpu\", \"nvlink\", \"numpy\"]\n", + "context_transcripts = []\n", + "for word in cb_words_small:\n", + " # use text_to_tokens method for viasualization only\n", + " word_tokenization = ctc_model.tokenizer.text_to_tokens(word)\n", + " print(f\"{word}: {word_tokenization}\")\n", + " context_transcripts.append([word, [word_tokenization]])\n", + "\n", + "# build context graph\n", + "context_graph = context_biasing.ContextGraphCTC(blank_id=\"⊘\")\n", + "context_graph.add_to_graph(context_transcripts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7fab1e1", + "metadata": {}, + "outputs": [], + "source": [ + "context_graph.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1c57878", + "metadata": {}, + "outputs": [], + "source": [ + "# install graphviz from source if you have problems with graph picture\n", + "# set instal_graphviz = True\n", + "# this may take about 5-10 minutes\n", + "\n", + "instal_graphviz = False\n", + "\n", + "if instal_graphviz:\n", + " !{NEMO_DIR_PATH}/scripts/installers/install_graphviz.sh" + ] + }, + { + "cell_type": "markdown", + "id": "04a6f4be", + "metadata": {}, + "source": [ + "### Build a real context graph (for decoding)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ba2d8a1", + "metadata": {}, + "outputs": [], + "source": [ + "# get bpe tokenization\n", + "context_transcripts = []\n", + "for word in cb_words:\n", + " word_tokenization = [ctc_model.tokenizer.text_to_ids(x) for x in word]\n", + " context_transcripts.append([word, word_tokenization])\n", + "\n", + "# build context graph\n", + "context_graph = context_biasing.ContextGraphCTC(blank_id=ctc_model.decoding.blank_id)\n", + "context_graph.add_to_graph(context_transcripts)" + ] + }, + { + "cell_type": "markdown", + "id": "71e0e86b", + "metadata": {}, + "source": [ + "### Run CTC-based Word Spotter\n", + "\n", + "The CTC-WS task is to search for words by decoding CTC log probabilities using the context graph. As a result, we obtain a list of detected words with exact start/end frames in the audio file and their overall scores. The relatively small size of the context graph and hypotheses pruning methods allow this algorithm to work very quickly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2bc370b", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from tqdm.notebook import tqdm\n", + "\n", + "# get ctc logprobs\n", + "audio_file_paths = [item['audio_filepath'] for item in test_data]\n", + "\n", + "with torch.no_grad():\n", + " ctc_model.eval()\n", + " ctc_model.encoder.freeze()\n", + " device = next(ctc_model.parameters()).device\n", + " hyp_results = ctc_model.transcribe(audio_file_paths, batch_size=10, return_hypotheses=True)\n", + " ctc_logprobs = [hyp.alignments.cpu().numpy() for hyp in hyp_results]\n", + " blank_idx = ctc_model.decoding.blank_id\n", + " \n", + "# run ctc-based word spotter\n", + "ws_results = {}\n", + "for idx, logits in tqdm(\n", + " enumerate(ctc_logprobs), desc=f\"Eval CTC-based Word Spotter...\", total=len(ctc_logprobs)\n", + "):\n", + " ws_results[audio_file_paths[idx]] = context_biasing.run_word_spotter(\n", + " logits,\n", + " context_graph,\n", + " ctc_model,\n", + " blank_idx=blank_idx,\n", + " beam_threshold=7.0,\n", + " cb_weight=3.0,\n", + " ctc_ali_token_weight=0.5,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bd6645c", + "metadata": {}, + "outputs": [], + "source": [ + "# print CTC-WS hypotheses for the first audio file\n", + "ws_results[audio_file_paths[0]]" + ] + }, + { + "cell_type": "markdown", + "id": "245a66f0", + "metadata": {}, + "source": [ + "### Merge CTC-WS words with greedy CTC decoding results\n", + "\n", + "Use `print_stats=True` to get more information about spotted words and greedy CTC word alignment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "423b2b9e", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "target_transcripts = [item['text'] for item in test_data]\n", + "\n", + "# merge spotted words with greedy results\n", + "for idx, logprobs in enumerate(ctc_logprobs):\n", + " greedy_predicts = np.argmax(logprobs, axis=1)\n", + " if ws_results[audio_file_paths[idx]]:\n", + " # make new text by mearging alignment with ctc-ws predictions:\n", + " print(\"\\n\" + \"********\" * 10)\n", + " print(f\"File name: {audio_file_paths[idx]}\")\n", + " pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps(\n", + " greedy_predicts,\n", + " ctc_model,\n", + " ws_results[audio_file_paths[idx]],\n", + " decoder_type=\"ctc\",\n", + " blank_idx=blank_idx,\n", + " print_stats=True,\n", + " )\n", + " print(f\"[raw text]: {raw_text}\")\n", + " print(f\"[hyp text]: {pred_text}\")\n", + " print(f\"[ref text]: {target_transcripts[idx]}\")\n", + " else:\n", + " # if no spotted words, use standard greedy predictions\n", + " pred_text = ctc_model.wer.decoding.ctc_decoder_predictions_tensor(greedy_predicts)[0][0]" + ] + }, + { + "cell_type": "markdown", + "id": "fb8b5f51", + "metadata": {}, + "source": [ + "In these logs, you can find detailed context-biasing statistics about each audio file:\n", + "- Audio file name\n", + "- Greedy word alignment\n", + "- List of spotted words\n", + "- Text results:\n", + " - Greedy decoding (raw text)\n", + " - Text after applying context-biasing (hyp text)\n", + " - Ground truth transcription (ref text)\n", + " \n", + "These statistics can be helpful in case of problems with context-biasing word recognition. For example, Transducer models sometimes recognize tokens 1-2 frames earlier than CTC models. To solve this problem, you can shift the start frame of the detected word left in the CTC-WS sources." 
+ ] + }, + { + "cell_type": "markdown", + "id": "11220db2", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This tutorial demonstrates how to use the CTC-WS context-biasing technique to improve the recognition accuracy of specific words in the case of CTC and Transducer (RNN-T) ASR models. The tutorial includes the methodology for creating the context-biasing list, improving recognition accuracy of abbreviations and compound words, visualization of the context-biasing process, and results analysis.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/asr/README.md b/tutorials/asr/README.md index 77f157acac0c..138e13f58a08 100644 --- a/tutorials/asr/README.md +++ b/tutorials/asr/README.md @@ -34,6 +34,7 @@ In this repository, you will find several tutorials discussing what is Automatic 13) `ASR_Example_CommonVoice_Finetuning`: Learn how to fine-tune an ASR model using CommonVoice to a new alphabet, Esperanto. We walk through the data processing steps of MCV data using HuggingFace Datasets, preparation of the tokenizer, model and then setup fine-tuning. +14) `ASR_Context_Biasing`: This tutorial aims to show how to improve the recognition accuracy of specific words in NeMo framework for CTC and Trasducer (RNN-T) ASR models by using the fast context-biasing method with CTC-based Word Spotter. ---------------- From da2d3c71bea5c862b6fc0a83e035914d3a60c407 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Thu, 14 Mar 2024 17:22:10 -0400 Subject: [PATCH 021/140] update for manifest loading (#8661) Signed-off-by: stevehuang52 --- .../asr/parts/utils/manifest_utils.py | 19 +++++++++++++++++-- .../asr/parts/utils/transcribe_utils.py | 11 +++++------ .../common/parts/preprocessing/manifest.py | 19 ++++++++++++++++++- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/nemo/collections/asr/parts/utils/manifest_utils.py b/nemo/collections/asr/parts/utils/manifest_utils.py index 71a35ceb3426..e9f91045c9a2 100644 --- a/nemo/collections/asr/parts/utils/manifest_utils.py +++ b/nemo/collections/asr/parts/utils/manifest_utils.py @@ -30,6 +30,7 @@ segments_manifest_to_subsegments_manifest, write_rttm2manifest, ) +from nemo.utils import logging from nemo.utils.data_utils import DataStoreObject @@ -476,10 +477,24 @@ def read_manifest(manifest: Union[Path, str]) -> List[dict]: f = open(manifest.get(), 'r', encoding='utf-8') except: raise Exception(f"Manifest file could not be opened: {manifest}") - for line in f: - item = json.loads(line) + + errors = [] + for line in f.readlines(): + line = line.strip() + if not line: + continue + try: + item = json.loads(line) + except json.JSONDecodeError: + errors.append(line) + continue data.append(item) f.close() + if errors: + logging.error(f"{len(errors)} Errors encountered while reading manifest file: {manifest}") + for error in errors: + logging.error(f"-- Failed to parse line: `{error}`") + raise RuntimeError(f"Errors encountered while reading manifest file: {manifest}") return data diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py 
b/nemo/collections/asr/parts/utils/transcribe_utils.py index e5cd8d7bbc10..681fab751e5f 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -26,7 +26,7 @@ import nemo.collections.asr as nemo_asr from nemo.collections.asr.metrics.wer import word_error_rate from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel, EncDecMultiTaskModel -from nemo.collections.asr.parts.utils import rnnt_utils +from nemo.collections.asr.parts.utils import manifest_utils, rnnt_utils from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR, FrameBatchMultiTaskAED from nemo.collections.common.metrics.punct_er import OccurancePunctuationErrorRate from nemo.collections.common.parts.preprocessing.manifest import get_full_path @@ -295,10 +295,9 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: return filepaths, partial_audio -def read_and_maybe_sort_manifest(path: str, try_sort: bool = False) -> list[dict]: +def read_and_maybe_sort_manifest(path: str, try_sort: bool = False) -> List[dict]: """Sorts the manifest if duration key is available for every utterance.""" - with open(path) as f: - items = [json.loads(l) for l in f] + items = manifest_utils.read_manifest(path) if try_sort and all("duration" in item for item in items): items = sorted(items, reverse=True, key=lambda item: item["duration"]) return items @@ -563,8 +562,8 @@ def compute_metrics_per_sample( manifest_path: str, reference_field: str = "text", hypothesis_field: str = "pred_text", - metrics: list[str] = ["wer"], - punctuation_marks: list[str] = [".", ",", "?"], + metrics: List[str] = ["wer"], + punctuation_marks: List[str] = [".", ",", "?"], output_manifest_path: str = None, ) -> dict: diff --git a/nemo/collections/common/parts/preprocessing/manifest.py b/nemo/collections/common/parts/preprocessing/manifest.py index d3cc02fe3c68..1d49bd7c7019 100644 --- a/nemo/collections/common/parts/preprocessing/manifest.py +++ b/nemo/collections/common/parts/preprocessing/manifest.py @@ -15,6 +15,7 @@ import json import os import re +from collections import defaultdict from os.path import expanduser from typing import Any, Callable, Dict, Iterator, List, Optional, Union @@ -70,6 +71,7 @@ def item_iter( if parse_func is None: parse_func = __parse_item + errors = defaultdict(list) k = -1 logging.debug('Manifest files: %s', str(manifests_files)) for manifest_file in manifests_files: @@ -78,12 +80,27 @@ def item_iter( logging.debug('Cached at: %s', str(cached_manifest_file)) with open(expanduser(cached_manifest_file), 'r') as f: for line in f: + line = line.strip() + if not line: + continue k += 1 - item = parse_func(line, manifest_file) + try: + item = parse_func(line, manifest_file) + except json.JSONDecodeError: + errors[str(manifest_file)].append(line) + continue item['id'] = k yield item + if len(errors) > 0: + for filename, lines in errors.items(): + logging.error("=============================================") + logging.error(f"Failed to parse {len(lines)} lines from manifest file: {filename}") + for line in lines: + logging.error(f"-- Failed to parse line: `{line}`") + raise RuntimeError("Failed to parse some lines from manifest files. 
See logs for more details.") + def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]: item = json.loads(line) From 01aedc69d673a89c85d680c7bfe58a10d36f718c Mon Sep 17 00:00:00 2001 From: Rachit Garg Date: Thu, 14 Mar 2024 15:50:36 -0700 Subject: [PATCH 022/140] add the persistent_workers to the dataloader (#8654) Signed-off-by: rachitg Co-authored-by: rachitg --- .../nlp/models/language_modeling/megatron_gpt_sft_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 325f039d461b..7ab00f1af85a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -804,6 +804,7 @@ def build_data_loader(self, dataset, data_cfg, consumed_samples=0): collate_fn=collate_fn, num_workers=data_cfg.num_workers, pin_memory=data_cfg.pin_memory, + persistent_workers=True if data_cfg.num_workers > 0 else False, ) def setup_training_dataloader(self): From c9347b98038b704ee79da56bc7ee0e30e6161900 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Thu, 14 Mar 2024 15:55:09 -0700 Subject: [PATCH 023/140] LLM Embedding model (#8622) * config update Signed-off-by: arendu * save embeddings and some refac Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * entry point script for dumping embeddings to disk Signed-off-by: arendu * normalize query and pos_doc even if no soft negatives are used Signed-off-by: arendu * yaml for generation script Signed-off-by: arendu * all possible negatives Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * logging Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * need to update docstrings Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * headers and rename Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * log diff and fix cs logging Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * non-standard solution to get wandb logger to have the config Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * check for rank Signed-off-by: arendu * cfg working for multi gpu Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * MCoreMixin chages. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* using new commit of meg-LM Signed-off-by: arendu
* default to use all layers for lora Signed-off-by: arendu
* validation only uses hard negatives, val scores are batch agnostic Signed-off-by: arendu
* minor reorg Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* metadata and bug fixes Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* dump embeddings with traceable ids, disabled val logs for the moment Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* val ids Signed-off-by: arendu
* val ids by consumed samples Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* don't gather if not saving embs Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* init global step to allow consumed samples to be called in test time Signed-off-by: arendu
* enable adapters with packed seq Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Add the following changes for PTL 2.1: 1) Remove LightningModuleWrapperBase around model as it's not required with PTL 2.1 2) Make precision None when using the precision plugin in MegatronTrainerBuilder 3) Change dataloader_iter API for some megatron models Signed-off-by: Abhishree
* Change dataloader_iter API and remove val_iterator_done: 1) Change dataloader_iter API according to PTL 2.1 for BERT and GPT models 2) Comment self._val_iterator_done for all megatron models Signed-off-by: Abhishree
* Override format_checkpoint_name and fix dataloader_iter API Signed-off-by: Abhishree
* Update PTL version in requirements Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Remove unused import and comment val_iterator_done Signed-off-by: Abhishree
* Override _link_checkpoint Signed-off-by: Abhishree
* Temporarily disable GPU unit tests Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Temporarily comment out CPU unit tests Signed-off-by: Abhishree
* Remove precision arg from Trainer in convert_hf_llama_to_nemo.py Signed-off-by: Abhishree
* Fix dataloader_iter API for megatron_lm_encoder_decoder_model.py Signed-off-by: Abhishree
* Temporarily disable NMT Training TP=2 test Signed-off-by: Abhishree
* Fix val_step, test_step func API of MegatronLMEncoderDecoderModel Signed-off-by: Abhishree
* Enable NMT training TP=2 test Signed-off-by: Abhishree
* Disable some unit tests Signed-off-by: Abhishree
* Comment CI tests Signed-off-by: Abhishree
* Comment resume part of BART Signed-off-by: Abhishree
* Uncomment few lines from JenkinsFile Signed-off-by: Abhishree
* Return len of dataloader in microbatches Signed-off-by: Abhishree
* Fix _link_checkpoint: 1) Add inject_model_parallel_rank to _link_checkpoint 2) Override super._link_checkpoint to remove condition check for rank 0 Signed-off-by: Abhishree
* Check if using dist ckpt in _link_checkpoint Signed-off-by: Abhishree
* Temporarily disable GPT with PP=2 Signed-off-by: Abhishree
* Remove batch_idx arg from validation_step megatron_gpt_sft_model.py Signed-off-by: Abhishree
* Use PTL bug fix branch to test unit tests with the PTL bug fix https://github.com/Lightning-AI/pytorch-lightning/pull/19344/files Signed-off-by: Abhishree
* Temporarily disable test_ema_saved_state in test_ema.py Signed-off-by: Abhishree
* Skip test_beam_decoding_preserve_alignments in test_rnnt_decoding.py Signed-off-by: Abhishree
* Use PTL with fs.lexists Signed-off-by: Abhishree
* Comment _link_checkpoint related overrides in order to test with PTL without symbolic links Signed-off-by: Abhishree
* Return only batch for dataloader_iter in DFT model Signed-off-by: Abhishree
* Modify get_batch in GPTModel Signed-off-by: Abhishree
* Add condition checks for batch extraction from dataloader_iter Signed-off-by: Abhishree
* Add missing condition check for batch extraction in GPTModel Signed-off-by: Abhishree
* Add condition check for dataloader_iter extraction in MegatronLMEncoderDecoder Signed-off-by: Abhishree
* Comment test_invalid_checkpoints_removed_from_topk in test_exp_manager.py Signed-off-by: Abhishree
* Fix test invalid ckpts in test_exp_manager.py; also uncomment some of the commented out tests in JenkinsFile and test_ema.py Signed-off-by: Abhishree
* Fix bug in test_invalid_checkpoints_removed_from_topk Signed-off-by: Abhishree
* Fix validation step of GPTModel for finetuning case with multi dataloaders Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* multi dataloaders for validation query and docs Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* validation loop made more efficient with 2 dataloaders Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* WIP test set generation Signed-off-by: arendu
* generate working for multi dataloaders Signed-off-by: arendu
* Fix test_step_outputs for SFT in GPTModel Signed-off-by: Abhishree
* Pass dataloader_idx for val_step of GPTModel and remove unwanted code: 1) Pass dataloader_idx to val_step of GPTModel as it's required for GPTSFTModel with multi dataloaders to append the outputs correctly to val/test_step_output 2) Remove val_iterator_done check from all megatron GPT models Signed-off-by: Abhishree
* Add condition check for extraction of batch in T5SFTModel & LMEncoderDecoder Signed-off-by: Abhishree
* Add condition check for extracting batch in MegatronNMTModel; also uncomment GPT PP=2 and NMT tests from JenkinsFile Signed-off-by: Abhishree
* Fix typo and uncomment multimodal tests Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* default names Signed-off-by: arendu
* Change to new dataloader_iter API for MultiModal Signed-off-by: Abhishree
* Fix new dataloader API for MegatronLatentDiffusion model Signed-off-by: Abhishree
* Store and restore precision value in MegatronGPTSFTModel Signed-off-by: Abhishree
* Temporarily comment Multimodal Stable Diffusion Train Signed-off-by: Abhishree
* Update JenkinsFile for multimodal with latest main Signed-off-by: Abhishree
* Upgrade PTL to version 2.2 in reqs Signed-off-by: Abhishree
* Install PTL 2.2 from fork Signed-off-by: Abhishree
* Add strict arg to load_model_state_dict func in NLPDDPStrategy Signed-off-by: Abhishree
* Delete megatron_t5_adapter_tuning.py and megatron_t5_ia3_tuning.py; these files were added in the branch by mistake Signed-off-by: Abhishree
* Delete megatron_t5_prompt_learning.py that got added by mistake Signed-off-by: Abhishree
* Add appropriate comments, code clean up Signed-off-by: Abhishree
* Remove PTL installation from JenkinsFile Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* update Signed-off-by: arendu
* llm embeddings with ptl2.2 Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* global in-batch negatives using all-gather Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* remove old files Signed-off-by: arendu
* remove changes in untouched files Signed-off-by: arendu
* inference for embedding model from ckpt Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
---------
Signed-off-by: arendu
Signed-off-by: Jiaqi Zeng
Signed-off-by: Adi Renduchintala
Signed-off-by: Abhishree
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jiaqi Zeng
Co-authored-by: Tugrul Konuk
Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Eric Harper --- Jenkinsfile | 356 +++++++------- ...megatron_gpt_embedder_generate_config.yaml | 216 +++++++++ .../megatron_gpt_embedder_tuning_config.yaml | 212 +++++++++ .../megatron_gpt_embedding_finetuning.py | 74 +++ .../megatron_gpt_embedding_generate.py | 135 ++++++ .../conf/megatron_gpt_finetuning_config.yaml | 2 +- .../gpt_embedding_dataset.py | 281 ++++++++++++ .../megatron_gpt_embedding_model.py | 433 ++++++++++++++++++ .../language_modeling/megatron_base_model.py | 2 + .../language_modeling/megatron_gpt_model.py | 35 +- .../megatron_gpt_sft_model.py | 234 +++++----- .../megatron_lm_encoder_decoder_model.py | 2 - .../nlp/parts/mixins/nlp_adapter_mixins.py | 3 +- .../construct_random_negatives.py | 0 .../information_retrieval/get_msmarco.sh | 0 15 files changed, 1699 insertions(+), 286 deletions(-) create mode 100644 examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml create mode 100644 examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml create mode 100644 examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py create mode 100644 examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py create mode 100644 nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py create mode 100644 nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py rename {examples/nlp/information_retrieval => scripts}/construct_random_negatives.py (100%) rename {examples/nlp => scripts}/information_retrieval/get_msmarco.sh (100%) diff --git a/Jenkinsfile b/Jenkinsfile index 602c78890262..b278a53d8213 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -85,6 +85,14 @@ pipeline { } } + stage('Pytorch lightning installation') { + steps { + sh 'git clone -b bug_fix https://github.com/athitten/pytorch-lightning.git && \ + cd pytorch-lightning && \ + PACKAGE_NAME=pytorch pip install -e .' 
+ } + } + // pip package should be working with main, if not we can update the commit here // until the pip package is updated stage('Megatron Core installation') { @@ -147,8 +155,8 @@ pipeline { stage('L2: Multimodal Imagen Train') { when { anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' + branch 'main' + changeRequest target: 'main' } } failFast true @@ -161,7 +169,6 @@ pipeline { trainer.devices=1 \ ++exp_manager.max_time_per_run=00:00:03:00 \ trainer.max_steps=20 \ - model.conditioning.embed_dim=64 \ model.micro_batch_size=1 \ model.global_batch_size=1 \ model.data.synthetic_data=True \ @@ -173,11 +180,12 @@ pipeline { sh "rm -rf /home/TestData/multimodal/imagen_train" } } + stage('L2: Multimodal Stable Diffusion Train') { when { anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' + branch 'main' + changeRequest target: 'main' } } failFast true @@ -204,81 +212,79 @@ pipeline { model.unet_config.from_pretrained=null \ model.first_stage_config.from_pretrained=null \ model.unet_config.use_flash_attention=False \ - model.unet_config.attention_resolutions=[1] \ - model.unet_config.channel_mult=[1] \ " sh "pip install 'webdataset>=0.1.48,<=0.1.62'" sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" } } -// stage('L2: Multimodal ControlNet Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/controlnet_train" -// sh "pip install webdataset==0.2.48" -// sh "python examples/multimodal/text_to_image/controlnet/controlnet_train.py \ -// trainer.precision=16 \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// model.data.synthetic_data=True \ -// exp_manager.exp_dir=/home/TestData/multimodal/controlnet_train \ -// model.inductor=False \ -// model.image_logger.max_images=0 \ -// model.control_stage_config.params.from_pretrained_unet=null \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.unet_config.use_flash_attention=False \ -// " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/multimodal/controlnet_train" -// } -// } -// stage('L2: Multimodal DreamBooth Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/dreambooth_train" -// sh "pip install webdataset==0.2.48" -// sh "python examples/multimodal/text_to_image/dreambooth/dreambooth.py \ -// trainer.precision=16 \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// exp_manager.exp_dir=/home/TestData/multimodal/dreambooth_train \ -// model.inductor=False \ -// model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ -// ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ -// ++model.cond_stage_config.max_length=77 \ -// ~model.cond_stage_config.restore_from_path \ -// ~model.cond_stage_config.freeze \ -// ~model.cond_stage_config.layer \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.data.instance_dir=/home/TestData/multimodal/tiny-dreambooth \ -// 
model.unet_config.use_flash_attention=False \ -// " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/multimodal/dreambooth_train" -// } -// } + stage('L2: Multimodal ControlNet Train') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/controlnet_train" + sh "pip install webdataset==0.2.48" + sh "python examples/multimodal/text_to_image/controlnet/controlnet_train.py \ + trainer.precision=16 \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.synthetic_data=True \ + exp_manager.exp_dir=/home/TestData/multimodal/controlnet_train \ + model.inductor=False \ + model.image_logger.max_images=0 \ + model.control_stage_config.params.from_pretrained_unet=null \ + model.unet_config.from_pretrained=null \ + model.first_stage_config.from_pretrained=null \ + model.unet_config.use_flash_attention=False \ + " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/multimodal/controlnet_train" + } + } + stage('L2: Multimodal DreamBooth Train') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/dreambooth_train" + sh "pip install webdataset==0.2.48" + sh "python examples/multimodal/text_to_image/dreambooth/dreambooth.py \ + trainer.precision=16 \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + exp_manager.exp_dir=/home/TestData/multimodal/dreambooth_train \ + model.inductor=False \ + model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ + ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ + ++model.cond_stage_config.max_length=77 \ + ~model.cond_stage_config.restore_from_path \ + ~model.cond_stage_config.freeze \ + ~model.cond_stage_config.layer \ + model.unet_config.from_pretrained=null \ + model.first_stage_config.from_pretrained=null \ + model.data.instance_dir=/home/TestData/multimodal/tiny-dreambooth \ + model.unet_config.use_flash_attention=False \ + " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/multimodal/dreambooth_train" + } + } stage('L2: Vision ViT Pretrain TP=1') { when { anyOf { @@ -2725,106 +2731,106 @@ pipeline { } } } - stage('L2: Megatron NMT Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - 
model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - // Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - // if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - sh "rm -rf 
examples/nlp/machine_translation/megatron_nmt_results" - } - } + // stage('L2: Megatron NMT Training TP=2') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // steps { + // sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ + // trainer.devices=2 \ + // trainer.accelerator=gpu \ + // trainer.log_every_n_steps=1 \ + // trainer.val_check_interval=10 \ + // +trainer.limit_val_batches=2 \ + // trainer.accumulate_grad_batches=1 \ + // trainer.max_steps=10 \ + // trainer.precision=16 \ + // trainer.gradient_clip_val=1.0 \ + // exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ + // model.tensor_model_parallel_size=2 \ + // model.seq_length=128 \ + // model.encoder.num_layers=4 \ + // model.encoder.hidden_size=64 \ + // model.encoder.num_attention_heads=8 \ + // model.encoder.activation='swiglu' \ + // model.encoder.masked_softmax_fusion=False \ + // model.encoder.bias_activation_fusion=False \ + // model.encoder.activations_checkpoint_method='block' \ + // model.encoder.activations_checkpoint_num_layers=1 \ + // model.decoder.num_layers=2 \ + // model.decoder.hidden_size=64 \ + // model.decoder.num_attention_heads=8 \ + // model.decoder.activation='swiglu' \ + // model.decoder.masked_softmax_fusion=False \ + // model.decoder.bias_activation_fusion=False \ + // model.decoder.activations_checkpoint_method='block' \ + // model.decoder.activations_checkpoint_num_layers=1 \ + // model.micro_batch_size=2 \ + // model.global_batch_size=4 \ + // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + // model.train_ds.num_workers=1 \ + // model.validation_ds.num_workers=1 \ + // ~model.test_ds \ + // model.train_ds.dataset_type=text_memmap \ + // model.encoder_tokenizer.library=sentencepiece \ + // model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + // model.decoder_tokenizer.library=sentencepiece \ + // model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" + // // Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error + // // if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() + // sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ + // trainer.devices=2 \ + // trainer.accelerator=gpu \ + // trainer.log_every_n_steps=1 \ + // trainer.val_check_interval=1 \ + // +trainer.limit_val_batches=2 \ + // trainer.accumulate_grad_batches=1 \ + // trainer.max_steps=10 \ + // trainer.precision=16 \ + // trainer.gradient_clip_val=1.0 \ + // exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ + // model.tensor_model_parallel_size=2 \ + // model.seq_length=128 \ + // model.encoder.num_layers=4 \ + // model.encoder.hidden_size=64 \ + // model.encoder.num_attention_heads=8 \ + // model.encoder.activation='swiglu' \ + // model.encoder.masked_softmax_fusion=False \ + // model.encoder.bias_activation_fusion=False \ + // model.encoder.activations_checkpoint_method='block' \ + // 
model.encoder.activations_checkpoint_num_layers=1 \ + // model.decoder.num_layers=2 \ + // model.decoder.hidden_size=64 \ + // model.decoder.num_attention_heads=8 \ + // model.decoder.activation='swiglu' \ + // model.decoder.masked_softmax_fusion=False \ + // model.decoder.bias_activation_fusion=False \ + // model.decoder.activations_checkpoint_method='block' \ + // model.decoder.activations_checkpoint_num_layers=1 \ + // model.micro_batch_size=2 \ + // model.global_batch_size=4 \ + // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + // model.train_ds.num_workers=1 \ + // model.validation_ds.num_workers=1 \ + // ~model.test_ds \ + // model.train_ds.dataset_type=text_memmap \ + // model.encoder_tokenizer.library=sentencepiece \ + // model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + // model.decoder_tokenizer.library=sentencepiece \ + // model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" + // sh "rm -rf examples/nlp/machine_translation/megatron_nmt_results" + // } + // } stage('L2: Megatron BART Perceiver MIM Training TP=2') { // Testing Megatron hidden transformations when { diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml new file mode 100644 index 000000000000..778dc937efdc --- /dev/null +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml @@ -0,0 +1,216 @@ +name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.test_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: True + save_best_model: True + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 1 + micro_batch_size: 1 + restore_from_path: ??? 
# Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + temperature: 0.8 + num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only + + peft: + peft_scheme: "lora" # can be either adapter,ia3, or ptuning + restore_from_path: null + restore_from_ckpt: + checkpoint_dir: null + checkpoint_name: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['attention_qkv','attention_dense','mlp_fc1','mlp_fc2'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: ${peft.lora_tuning.adapter_dim} + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + return_output_tensors: True + test_ds: + query_file_names: ??? # Path to a list of JSONL files corresponding to the query data. Data format is identical to validation_ds. + doc_file_names: ??? # Path to a list of JSONL files corresponding to the doc data. Data format is identical to validation_ds. + names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. + global_batch_size: 1 + micro_batch_size: 1 + shuffle: False + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + add_eos: True + add_bos: False + write_embeddings_to_file: True + output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + outfile_path: output.txt + compute_attention_mask: True + +# server-related configs +server: False # whether launch the API server +port: 5555 # the port number for the inference server +web_server: False # whether launch the web inference server +share: True # whether create a public URL +username: test # user name for web client +password: test2 # password for web client +web_port: 9889 # the port number of the web server 1058 +chat: False # use the chat interface +chatbot_config: + value: False # whether to inject the value attributes + attributes: + - name: Quality + min: 0 + max: 4 + key: quality + type: int + default: 4 + - name: Toxicity + min: 0 + max: 4 + key: toxcity + type: int + default: 0 + - name: Humor + min: 0 + max: 4 + key: humor + type: int + default: 0 + - name: Creativity + min: 0 + max: 4 + key: creativity + type: int + default: 0 + - name: Violence + min: 0 + max: 4 + key: violence + type: int + default: 0 + - name: Helpfulness + min: 0 + max: 4 + key: helpfulness + type: int + default: 4 + - name: Not_Appropriate + min: 0 + max: 4 + key: not_appropriate + type: int + default: 0 + - name: Language + choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] + key: lang + type: list + default: en + + user: User + assistant: Assistant + system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml new file mode 100644 index 000000000000..efd5271884ed --- /dev/null +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml @@ -0,0 +1,212 @@ +name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + temperature: 0.8 + num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only + use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only + + peft: + peft_scheme: "lora" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. 
+ column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2'] # + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + return_output_tensors: True + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ??? # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_bos: False + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + validation_ds: + query_file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + doc_file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. 
+ global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_bos: ${model.data.train_ds.add_bos} + write_embeddings_to_file: False + output_file_path_prefix: "validation_embeddings" # Prefix of the file to write predictions to. + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + add_eos: ${model.data.train_ds.add_eos} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: True + output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false \ No newline at end of file diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py new file mode 100644 index 000000000000..e1fe28cc892f --- /dev/null +++ b/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
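+# Example usage (a minimal sketch; the .nemo checkpoint and JSONL paths below are
+# placeholders and should be replaced with real files):
+#
+#   python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \
+#       model.restore_from_path=/path/to/base_gpt_model.nemo \
+#       model.peft.peft_scheme=lora \
+#       model.data.train_ds.file_names=[/path/to/train_triplets.jsonl] \
+#       model.data.validation_ds.query_file_names=[/path/to/val_queries.jsonl] \
+#       model.data.validation_ds.doc_file_names=[/path/to/val_docs.jsonl]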
+ +from collections.abc import MutableMapping + +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf +from pytorch_lightning.loggers import WandbLogger + +from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + +mp.set_start_method("spawn", force=True) + + +def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.') -> MutableMapping: + items = [] + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, MutableMapping): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_embedder_tuning_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + exp_manager(trainer, cfg.exp_manager) + + model_cfg = MegatronGPTEmbeddingModel.merge_cfg_with(cfg.model.restore_from_path, cfg) + if trainer.global_rank == 0: + for logger in trainer.loggers: + if isinstance(logger, WandbLogger): + fd = flatten_dict(dict(model_cfg), sep="/") + logger.experiment.config.update(fd) + model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + + if cfg.model.peft.restore_from_path is not None: + # initialize peft weights from a checkpoint instead of randomly + # This is not the same as resume training because optimizer states are not restored. + logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) + model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) + elif peft_cfg_cls is not None: + logging.info("Adding adapter weights to the model for PEFT") + model.add_adapter(peft_cfg_cls(model_cfg)) + else: + logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py new file mode 100644 index 000000000000..8cddcebbab62 --- /dev/null +++ b/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
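+# Example usage for dumping query/document embeddings with a tuned adapter (a minimal
+# sketch; all paths are placeholders):
+#
+#   python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \
+#       model.restore_from_path=/path/to/base_gpt_model.nemo \
+#       model.peft.restore_from_path=/path/to/embedding_adapter.nemo \
+#       model.data.test_ds.query_file_names=[/path/to/test_queries.jsonl] \
+#       model.data.test_ds.doc_file_names=[/path/to/test_docs.jsonl] \
+#       model.data.test_ds.write_embeddings_to_file=True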
+ + +import asyncio +import os +import threading +from functools import partial + +import torch +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf, open_dict + +from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel +from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer +from nemo.collections.nlp.modules.common.text_generation_utils import generate +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.model_utils import inject_model_parallel_rank + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +mp.set_start_method("spawn", force=True) + + +def use_inference_server(cfg, model, trainer): + if not HAVE_MEGATRON_CORE: + raise ValueError('Megatron-core needs to be installed to use this feature!') + + from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo + + trainer.test(model, dataloaders=None) + + if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: + if cfg.web_server: + if cfg.chat: + defaults = { + 'user': cfg.chatbot_config.user, + 'assistant': cfg.chatbot_config.assistant, + 'system': cfg.chatbot_config.system, + } + web_ui = partial( + get_chatbot_demo, + defaults=defaults, + value=cfg.chatbot_config.value, + attributes=cfg.chatbot_config.attributes, + ) + else: + web_ui = get_demo + loop = asyncio.new_event_loop() + thread = threading.Thread( + target=web_ui, daemon=True, args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), + ) + thread.start() + server = MegatronServer(model.cuda()) + server.run("0.0.0.0", port=cfg.port) + + while True: + choice = torch.cuda.LongTensor(1) + torch.distributed.broadcast(choice, 0) + if choice[0].item() == 0: + generate(model.cuda()) + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_embedder_generate_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + + if cfg.model.peft.restore_from_path: + model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) + else: + model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) + + with open_dict(model_cfg): + model_cfg.data.return_output_tensors = True + model_cfg.post_process = False + + model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + if cfg.model.peft.restore_from_path: + model.load_adapters(cfg.model.peft.restore_from_path) + elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + checkpoint_path = os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank( + os.path.join( + 
cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + ) + model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg)) + else: + raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") + + model.freeze() + logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") + + if not cfg.model.get('use_flash_attention', False): + cfg.inference.compute_attention_mask = True + config = OmegaConf.to_container(cfg.inference, resolve=True) + model.set_inference_config(config) + + if not cfg.server: + trainer.test(model) + else: + use_inference_server(cfg, model, trainer) + + +if __name__ == "__main__": + main() diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 40347f317fbb..a50b578b95f4 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -101,7 +101,7 @@ model: position_embedding_strategy: null # used only when weight_tying is True lora_tuning: - target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + target_modules: ['attention_qkv','attention_dense','mlp_fc1','mlp_fc2'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) adapter_dim: 32 alpha: ${model.peft.lora_tuning.adapter_dim} adapter_dropout: 0.0 diff --git a/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py new file mode 100644 index 000000000000..352aff87217b --- /dev/null +++ b/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py @@ -0,0 +1,281 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from typing import Mapping, Optional
+
+import datasets
+import numpy as np
+import torch
+
+# hack to avoid the "not enough disk space" error in some slurm clusters
+datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory='.': True
+
+from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
+from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping
+from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset
+from nemo.core.classes import Dataset
+from nemo.utils import logging
+
+__all__ = ['GPTEmbeddingDataset']
+
+
+class GPTEmbeddingDataset(Dataset):
+    def __init__(
+        self,
+        file_path: str,
+        tokenizer: TokenizerSpec,
+        max_seq_length: int = 1024,
+        min_seq_length: int = 1,
+        add_bos: bool = False,
+        add_eos: bool = True,
+        max_num_samples: int = None,
+        seed: int = 1234,
+        index_mapping_dir: str = None,
+        virtual_tokens: int = 0,
+        memmap_workers: Optional[int] = None,
+        truncation_method: str = 'right',
+        special_tokens: Optional[Mapping[str, str]] = None,  # special tokens, a dictionary of {token_type: token}
+        data_type: str = 'train',  # train, query or doc
+    ):
+        """
+        file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format.
+        tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece).
+        max_seq_length (int): maximum sequence length for each dataset example. Examples will either be truncated to fit this length or dropped if they cannot be truncated.
+        min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements.
+        add_bos (bool): Whether to add a beginning of sentence token to each data example
+        add_eos (bool): Whether to add an end of sentence token to each data example
+        seed: Random seed for data shuffling.
+        max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded.
+        index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset.
+        truncation_method: Truncation from which position. Options: ['left', 'right']
+        special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}.
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + """ + # TODO: lot of copy-paste from GPTSFDDataset, should refactor both to use a common base class (@adithyare) + self.tokenizer = tokenizer + self.file_path = file_path + self.max_seq_length = max_seq_length + self.min_seq_length = min_seq_length + self.add_bos = add_bos + self.add_eos = add_eos + self.max_num_samples = max_num_samples + self.seed = seed + self.index_mapping_dir = index_mapping_dir + self.virtual_tokens = virtual_tokens + self.truncation_method = truncation_method + if special_tokens is None: + self.special_tokens = { + "system_turn_start": "", + "turn_start": "", + "label_start": "", + "end_of_turn": "\n", + "end_of_name": "\n", + } + else: + self.special_tokens = special_tokens + self.data_type = data_type + + self.indexed_dataset = JSONLMemMapDataset( + dataset_paths=[file_path], + tokenizer=None, + header_lines=0, + index_mapping_dir=index_mapping_dir, + workers=memmap_workers, + ) + + # Will be None after this call if `max_num_samples` is None + self.samples_mapping = None + self._build_samples_mapping() + + def _build_samples_mapping(self): + if self.max_num_samples is not None: + self.samples_mapping = get_samples_mapping( + indexed_dataset=self.indexed_dataset, + data_prefix=self.file_path, + num_epochs=None, + max_num_samples=self.max_num_samples, + max_seq_length=self.max_seq_length - 2, + short_seq_prob=0, + seed=self.seed, + name=self.file_path.split('/')[-1], + binary_head=False, + index_mapping_dir=self.index_mapping_dir, + ) + else: + self.samples_mapping = None + + def __len__(self): + if self.max_num_samples is None: + return len(self.indexed_dataset) + else: + assert self.samples_mapping is not None + return len(self.samples_mapping) + + def __getitem__(self, idx): + if isinstance(idx, np.int64): + idx = idx.item() + + if self.samples_mapping is not None: + assert idx < len(self.samples_mapping) + idx, _, _ = self.samples_mapping[idx] + if isinstance(idx, np.uint32): + idx = idx.item() + + assert idx < len(self.indexed_dataset) + # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1 + if idx < 0: + idx = len(self) + idx + auto_gen_idx = True + else: + auto_gen_idx = False + try: + example = self.indexed_dataset[idx] + if auto_gen_idx: + example['__AUTOGENERATED__'] = True + except Exception as e: + logging.error(f"Error while loading example {idx} from dataset {self.file_path}") + raise e + return self._process_example(example) + + def _process_example(self, example): + """ + Create an example by concatenating text and answer. + Truncation is carried out when needed, but it is performed only on the prompt side. + BOS, EOS, and SEP, are added if specified. 
+ """ + metadata = {k: v for k, v in example.items()} + if self.data_type == 'train': + q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) + d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) + nd = self.tokenizer.text_to_ids("passage: " + example['neg_doc'].strip()) + elif self.data_type == 'query': + q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) + d, nd = None, None + assert "query_id" in example, "query_id is required for query dataset" + assert "doc_id" in example, "doc_id is required for query dataset" + elif self.data_type == 'doc': + d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) + assert "doc_id" in example, "doc_id is required for doc dataset" + q, nd = None, None + else: + raise ValueError(f"Invalid data type: {self.data_type}") + + q = q if q is not None else [] + d = d if d is not None else [] + nd = nd if nd is not None else [] + + if self.virtual_tokens: + # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context + # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) + q = [self.tokenizer.eos_id] * self.virtual_tokens + q # type: ignore + d = [self.tokenizer.eos_id] * self.virtual_tokens + d # type: ignore + nd = [self.tokenizer.eos_id] * self.virtual_tokens + nd # type: ignore + + if self.add_bos: + q = [self.tokenizer.bos_id] + q # type: ignore + d = [self.tokenizer.bos_id] + d # type: ignore + nd = [self.tokenizer.bos_id] + nd # type: ignore + + # TODO: (@adithyare) should probably add a warning before truncation + q = q[: self.max_seq_length - 1] + d = d[: self.max_seq_length - 1] + nd = nd[: self.max_seq_length - 1] + + if self.add_eos: + q = q + [self.tokenizer.eos_id] # type: ignore + d = d + [self.tokenizer.eos_id] # type: ignore + nd = nd + [self.tokenizer.eos_id] # type: ignore + + processed_example = { + 'query': q, + 'pos_doc': d, + 'neg_doc': nd, + 'metadata': metadata, + } + + return processed_example + + def _maybe_cast_to_list(self, x): + if isinstance(x, np.ndarray): + return [item.tolist() for item in x] + return x + + def _ceil_to_nearest(self, n, m): + return (n + m - 1) // m * m + + def _collate_item(self, item, max_length, pad_id): + item = self._maybe_cast_to_list(item) + # max_length = max([len(x) for x in item]) if item else 0 + # here [0] should be tokenizer.pad_id + item = [x + [pad_id] * (max_length - len(x)) for x in item] + return item + + @torch.no_grad() + def _create_attention_mask(self, max_length): + """Create `attention_mask`. + Args: + input_ids: A 1D tensor that holds the indices of tokens. 
+ """ + # seq_length = len(input_ids) + # `attention_mask` has the shape of [1, seq_length, seq_length] + attention_mask = torch.tril(torch.ones((max_length, max_length))).unsqueeze(0) + attention_mask = attention_mask < 0.5 + return attention_mask + + def collate_fn(self, batch): + input_ids = [] + metadata = [] + lengths = [] + max_length = -1 + for item in batch: + metadata.append(item['metadata']) + if self.data_type == 'train': + input_ids.append(item['query']) + lengths.append(len(item['query'])) + input_ids.append(item['pos_doc']) + lengths.append(len(item['pos_doc'])) + input_ids.append(item['neg_doc']) + lengths.append(len(item['neg_doc'])) + max_length = max(max_length, len(item['query']), len(item['pos_doc']), len(item['neg_doc'])) + elif self.data_type == 'query': + input_ids.append(item['query']) + lengths.append(len(item['query'])) + max_length = max(max_length, len(item['query'])) + elif self.data_type == 'doc': + input_ids.append(item['pos_doc']) + lengths.append(len(item['pos_doc'])) + max_length = max(max_length, len(item['pos_doc'])) + else: + raise ValueError(f"Invalid data type: {self.data_type}") + + max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) + assert max_length <= self.max_seq_length + + attention_mask = [self._create_attention_mask(max_length) for _ in batch] + attention_mask = torch.stack(attention_mask) + position_ids = [list(range(max_length)) for _ in batch] + position_ids = torch.LongTensor(position_ids) + input_ids = torch.LongTensor( + self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) + ) + lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token + + processed_batch = { + 'tokens': input_ids, + 'attention_mask': attention_mask, + 'loss_mask': lengths, + 'position_ids': position_ids, + 'metadata': metadata, + } + + return processed_batch diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py new file mode 100644 index 000000000000..91fa4a6f92b5 --- /dev/null +++ b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py @@ -0,0 +1,433 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools +import os + +import numpy as np +import torch +from omegaconf import DictConfig, ListConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTEmbeddingDataset +from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( + get_datasets_weights_and_num_samples, +) +from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel +from nemo.utils import logging + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False +try: + + HAVE_APEX = True +except (ImportError, ModuleNotFoundError): + HAVE_APEX = False + + +def listify(tensor): + l_tensor = [] + for t in tensor: + for rid in range(t.shape[0]): + r = t[rid, :].unsqueeze(0).cpu() + l_tensor.append(r) + return l_tensor + + +class MegatronGPTEmbeddingModel(MegatronGPTSFTModel): + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer=trainer) + self.temperature = self.cfg.get('temperature', 0.02) + self.use_all_possible_negatives = self.cfg.get("use_all_possible_negatives", True) + self.global_inbatch_negatives = self.cfg.get("global_inbatch_negatives", True) + assert ( + self.cfg.get("post_process", False) is False + ), "post_process must be False to get hidden states in the loss_func" + + def model_provider_func(self, pre_process, post_process): + # (@adithyare) We need post_process to be False to get hidden states in the loss_func + return super().model_provider_func(pre_process, post_process=False) + + def maybe_setup_test(self): + if ( + hasattr(self.cfg.data, 'test_ds') + and self.cfg.data.test_ds.get('doc_file_names', None) is not None + and self.cfg.data.test_ds.get('query_file_names', None) is not None + ): + self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) + return + + def maybe_build_test(self): + if ( + hasattr(self.cfg.data, 'test_ds') + and self.cfg.data.test_ds.get('doc_file_names', None) is not None + and self.cfg.data.test_ds.get('query_file_names', None) is not None + ): + logging.info('Building GPT Embedder test datasets.') + # Wrap this in a list since the general finetuning parent class supports multi-validation. + self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) + + def _build_dataset(self, data_cfg, is_train=True): + packed_sequence = data_cfg.get("packed_sequence", False) + + # Determine if we are using a single dataset or a list of datasets. + if is_train: + # Construct the data prefix list for `get_datasets_weights_and_num_samples()` + # that is of the format [weight1,file_name1,weight2,file_name2,...] + if data_cfg.concat_sampling_probabilities is None or not isinstance( + data_cfg.concat_sampling_probabilities, ListConfig + ): + raise ValueError( + ( + f"concat_sampling_probabilities must be a ListConfig with the same number of files in file_names." 
+ f"Found: {data_cfg.concat_sampling_probabilities}" + ) + ) + + if len(data_cfg.get('concat_sampling_probabilities', None)) != len(data_cfg.file_names): + raise ValueError( + ( + f"concat_sampling_probabilities must be of the same size as file_names.", + f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.file_names)}", + ) + ) + + data_prefix = [] + for weight, prefix in zip(data_cfg.concat_sampling_probabilities, data_cfg.file_names): + data_prefix.append(weight) + data_prefix.append(prefix) + + if self.trainer.max_steps is None or self.trainer.max_steps <= 0: + raise ValueError( + f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}' + ) + num_train_samples = [self.trainer.max_steps * data_cfg.global_batch_size] + _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) + num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) + else: + num_query_samples_per_dataset = [[None]] * len(data_cfg.query_file_names) + num_doc_samples_per_dataset = [[None]] * len(data_cfg.doc_file_names) + + # Check dataset max_seq_legnth and max_position_embeddings size + if ( + self.cfg.get('position_embedding_type', None) in [None, 'learned_absolute'] + and data_cfg.max_seq_length > self.cfg.max_position_embeddings + ): + logging.warning( + f"Set dataset max_seq_length to max_position_embeddings {self.cfg.max_position_embeddings} if using learned_absolute position embedding" + ) + data_cfg.max_seq_length = self.cfg.max_position_embeddings + + # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 + # When using sequence parallel, sequence will further be split by TP size + pad_seq_length_to_mult = ( + 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 + ) + if is_train: + datasets = [] + for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): + dataset = GPTEmbeddingDataset( + file_path=file_path, + tokenizer=self.tokenizer, + max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + max_num_samples=num_samples[0], + seed=data_cfg.get('seed', 1234), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + virtual_tokens=self.virtual_tokens, + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. of workers to create the memmap index files + truncation_method=data_cfg.get( + 'truncation_method', 'right' + ), # used to choose truncation method. Options: ['random', 'left', 'right'] + special_tokens=self.cfg.data.get( + 'chat_prompt_tokens', None + ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + ) + datasets.append(dataset) + if packed_sequence: + raise NotImplementedError("Packed sequence is not supported for MegatronGPTEmbeddingModel") + + dataset = BlendableDataset( + datasets=datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend + ) + return dataset + else: + query_dataset = GPTEmbeddingDataset( + file_path=data_cfg.query_file_names[0], + tokenizer=self.tokenizer, + max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + max_num_samples=None, + seed=data_cfg.get('seed', 1234), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + virtual_tokens=self.virtual_tokens, + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. of workers to create the memmap index files + truncation_method=data_cfg.get( + 'truncation_method', 'right' + ), # used to choose truncation method. Options: ['random', 'left', 'right'] + special_tokens=self.cfg.data.get( + 'chat_prompt_tokens', None + ), # special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + data_type="query", + ) + doc_dataset = GPTEmbeddingDataset( + file_path=data_cfg.doc_file_names[0], + tokenizer=self.tokenizer, + max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + max_num_samples=None, + seed=data_cfg.get('seed', 1234), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + virtual_tokens=self.virtual_tokens, + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. of workers to create the memmap index files + truncation_method=data_cfg.get( + 'truncation_method', 'right' + ), # used to choose truncation method. Options: ['random', 'left', 'right'] + special_tokens=self.cfg.data.get( + 'chat_prompt_tokens', None + ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + data_type="doc", + ) + return [query_dataset, doc_dataset] + + def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): + loss_mean, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, forward_only) + avg_pos_cs = non_loss_tensors['avg_pos_cs'][0].item() + avg_neg_cs = non_loss_tensors['avg_neg_cs'][0].item() + diff_cs = non_loss_tensors['diff_cs'][0].item() + self.log("avg_pos_cs", avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log("avg_neg_cs", avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log("diff_cs", diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + return loss_mean + + def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): + metadata = batch.get('metadata', [{}] * len(batch['tokens'])) + loss, non_loss_tensors = self.local_validation_step(itertools.chain([dataloader_idx], [batch])) + outputs = { + 'loss': loss, + 'metadata': metadata, # [dict] + 'q_hs': non_loss_tensors['query_hs'], # [batch_size, hidden_size] + 'd_hs': non_loss_tensors['doc_hs'], # [batch_size, hidden_size] + } + return outputs + + def gather_and_maybe_write_predictions(self, output, data_cfg, mode, dataloader_idx=0): + if not data_cfg.get("write_embeddings_to_file", False): + return True + gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] + torch.distributed.all_gather_object( + gathered_output_batches, + [{'q_hs': batch['q_hs'], 'd_hs': batch['d_hs'], 'metadata': batch['metadata'],} for batch in output], + group=parallel_state.get_data_parallel_group(), + ) + + # Remove duplicate examples due to distributed sampler. + deduplicated_outputs = { + 'q_hs': [], + 'd_hs': [], + 'metadata': [], + } + total_size, skipped = 0, 0 + for rank in range(0, parallel_state.get_data_parallel_world_size()): + for batch in gathered_output_batches[rank]: + l_q_hs = listify(batch['q_hs']) + l_d_hs = listify(batch['d_hs']) + l_m = batch['metadata'] + assert len(l_m) == len(l_q_hs) == len(l_d_hs) + for q_hs, d_hs, metadata in zip(l_q_hs, l_d_hs, l_m,): + total_size += 1 + if not metadata.get("__AUTOGENERATED__", False): + deduplicated_outputs['q_hs'].append(q_hs) + deduplicated_outputs['d_hs'].append(d_hs) + deduplicated_outputs['metadata'].append(metadata) + else: + skipped += 1 + + logging.info( + f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." + ) + # Compute metric score + metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name + assert metric_name == "loss", "Only loss is supported for now." 
+ # avg_pos_cs = torch.tensor(deduplicated_outputs['avg_pos_cs']).mean().item() + # avg_neg_cs = torch.tensor(deduplicated_outputs['avg_neg_cs']).mean().item() + # diff_cs = torch.tensor(deduplicated_outputs['diff_cs']).mean().item() + # self.log('val_avg_pos_cs', avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + # self.log('val_avg_neg_cs', avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + # self.log('val_diff_cs', diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + + # Write predictions to file + if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): + logging.info( + f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" + ) + + # Check if the user provided a prefix path to the file(s) they want to write. + if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: + raise ValueError( + f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." + ) + # (@adithyare) We are not using the log key to write the embeddings to file + filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) + consumed_samples = self._compute_consumed_samples_after_training_step() + fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" + self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) + return deduplicated_outputs, total_size + + def write_embeddings_to_file(self, outputs, output_file_path, d_idx): + emb_type = 'query' if d_idx == 0 else 'doc' + hs = torch.cat(outputs['q_hs' if d_idx == 0 else 'd_hs'], dim=0) + hs_npy = hs.float().numpy() + emb_fldr = f"{output_file_path}" + os.makedirs(emb_fldr, exist_ok=True) + with open(f"{output_file_path}/{emb_type}.ids", "w") as f: + for m in outputs['metadata']: + f.write(m[f"{emb_type}_id"] + "\n") + np.save(f"{emb_fldr}/{emb_type}.npy", hs_npy) + return True + + def local_validation_step(self, dataloader_iter): + """ + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. + """ + # Check if iterator is exhausted + # dataloader_iter, done = self._val_iterator_done(dataloader_iter) + # if done: + # return + # Get the dataloader_idx when MegatronGPTSFTModel calls validation_step of MegatronGPTModel + next_item_dataloader = next(dataloader_iter) + if isinstance(next_item_dataloader, int): + dataloader_idx = next_item_dataloader + else: + dataloader_iter = itertools.chain([next_item_dataloader], dataloader_iter) + mode = 'test' if self.trainer.testing else 'val' + # Initialize userbuffer communicators. + if self.initialize_ub: + self.initialize_ub_func() + + if isinstance(self.model, list): + for model_module in self.model: + model_module.eval() + + if self.cfg.get('fp8', False): + first_val_step = self.prev_step_training and not self.training + self.prev_step_training = self.training + else: + first_val_step = None + + loss, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, True, first_val_step) + + if isinstance(self.model, list): + for model_module in self.model: + model_module.train() + + if mode == 'val': + # MegatronGPTSFTModel class supports multiple dataloaders and uses validation_step of MegatronGPTModel. 
+ # Supporting that case with below lines + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + self.validation_step_outputs[dataloader_idx].append(loss) + else: + self.validation_step_outputs.append(loss) + else: + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx].append(loss) + else: + self.test_step_outputs.append(loss) + + return loss, non_loss_tensors + + def constrastive_scores(self, pos_doc_hs, neg_doc_hs, query_hs, bs, use_all_possible_negatives=False): + all_doc_hs = torch.cat([pos_doc_hs, neg_doc_hs], dim=0) # (2bs) x hidden_size + cs = torch.mm(query_hs, all_doc_hs.transpose(0, 1)) # (bs) x (2bs) + pos_cs = cs[:, :bs].diag() + neg_cs = cs[:, bs:].diag() + if use_all_possible_negatives: + labels = torch.arange(bs, device=cs.device).long() + else: + labels = torch.zeros(bs, device=cs.device).long() + cs = torch.cat([pos_cs.unsqueeze(1), neg_cs.unsqueeze(1)], dim=1) + pos_cs = pos_cs.clone().detach().mean() + neg_cs = neg_cs.clone().detach().mean() + return cs, pos_cs, neg_cs, labels + + def inference_loss_func(self, loss_mask, num_valid_tokens_in_ub, eos_tensors): + hs = eos_tensors + hs = torch.nn.functional.normalize(hs, dim=1) + _blank = torch.zeros(1, device=hs.device, dtype=hs.dtype)[0] + return _blank, hs, hs, _blank, _blank, _blank + + def _gather_global_inbatch_representations(self, local_eos_tensor): + local_eos_tensor = local_eos_tensor.contiguous() + global_eos_tensors = [ + torch.zeros_like(local_eos_tensor) for _ in range(parallel_state.get_data_parallel_world_size()) + ] + torch.distributed.all_gather( + global_eos_tensors, local_eos_tensor, group=parallel_state.get_data_parallel_group() + ) + global_eos_tensors[parallel_state.get_data_parallel_rank()] = local_eos_tensor + global_eos_tensors = torch.cat(global_eos_tensors, dim=0) + return global_eos_tensors + + def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): + idx = torch.arange(output_tensor.shape[1], device=output_tensor.device) + eos_tensors = output_tensor[loss_mask, idx, :] + if self.global_inbatch_negatives and self.trainer.training: + eos_tensors = self._gather_global_inbatch_representations(eos_tensors) + if not self.trainer.training: + return self.inference_loss_func(loss_mask, num_valid_tokens_in_ub, eos_tensors) + bs = eos_tensors.shape[0] // 3 + query_hs = eos_tensors[::3, :] # every third tensor is a query (bs x hidden_size) + pos_doc_hs = eos_tensors[1::3, :] # every third tensor is a positive doc (bs x hidden_size) + neg_doc_hs = eos_tensors[2::3, :] # every third tensor is a negative doc (bs x hidden_size) + + query_hs = torch.nn.functional.normalize(query_hs, dim=1) + pos_doc_hs = torch.nn.functional.normalize(pos_doc_hs, dim=1) + neg_doc_hs = torch.nn.functional.normalize(neg_doc_hs, dim=1) + + cs, pos_cs, neg_cs, labels = self.constrastive_scores( + pos_doc_hs, neg_doc_hs, query_hs, bs, self.use_all_possible_negatives + ) + cs = cs.clamp(-1.0, 1.0) + cs = cs / self.temperature + loss = torch.nn.functional.cross_entropy(cs, labels) + + cp_size = self.cfg.get('context_parallel_size', 1) + if cp_size > 1: + torch.distributed.all_reduce(loss, group=parallel_state.get_context_parallel_group()) + query_hs = query_hs.clone().detach() + pos_doc_hs = pos_doc_hs.clone().detach() + diff_cs = pos_cs - neg_cs + return loss, query_hs, pos_doc_hs, pos_cs, neg_cs, diff_cs diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py 
b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index cd5587351ecd..803bc671a7cf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -895,6 +895,8 @@ def compute_consumed_samples(self, steps_since_resume=0): def _compute_consumed_samples_after_training_step(self): # Add +1 to account for the current batch, which is not counted yet in `trainer.global_step`. + if not hasattr(self, 'init_global_step'): + self.init_global_step = 0 # in case this method is called before training starts. return self.compute_consumed_samples(self.trainer.global_step + 1 - self.init_global_step) def _extract_consumed_samples_from_ckpt(self, ckpt_path): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 79d48269d3a6..c9aae27eb5ed 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -637,6 +637,14 @@ def initialize_ub_func(self): ) self.initialize_ub = False + def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): + """ + This method is called from the training_step method. + It is separated out to allow for overriding in the MegatronGPTEmbeddingModel + """ + loss_mean = self.fwd_bwd_step(dataloader_iter, forward_only) + return loss_mean + def training_step(self, dataloader_iter): """ We pass the dataloader iterator function to the micro-batch scheduler. @@ -676,7 +684,7 @@ def training_step(self, dataloader_iter): for param in module.embedding.parameters(): param.data_ptr() - loss_mean = self.fwd_bwd_step(dataloader_iter, False) + loss_mean = self.training_step_fwd_bwd_step_call(dataloader_iter, forward_only=False) if self.cfg.get('fp8', False): self.prev_step_training = self.training @@ -1012,7 +1020,7 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ 'input_ids': batch['tokens'], 'position_ids': batch['position_ids'], 'attention_mask': None if self.get_attention_mask_from_fusion else batch['attention_mask'], - 'labels': batch['labels'], + 'labels': batch['labels'] if 'labels' in batch else None, 'loss_mask': batch['loss_mask'], } @@ -1056,8 +1064,27 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ def loss_func(output_tensor): # Loss for a micro-batch (ub) loss_for_ub = self.loss_func(batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor) - cp_size = parallel_state.get_context_parallel_world_size() - if validation_step and not self.cfg.data.get('validation_drop_last', True): + cp_size = self.cfg.get('context_parallel_size', 1) + if self.cfg.data.get( + "return_output_tensors", False + ): # TODO: need a better way to check if loss_func is returning more stuff than just loss... 
(@adithyare) + loss_for_ub, q_hs, d_hs, pos_cs, neg_cs, diff_cs = loss_for_ub + reduced_loss = average_losses_across_data_parallel_group([loss_for_ub]) + pos_cs = average_losses_across_data_parallel_group([pos_cs]) + neg_cs = average_losses_across_data_parallel_group([neg_cs]) + diff_cs = average_losses_across_data_parallel_group([diff_cs]) + return ( + loss_for_ub * cp_size, + { + 'avg': reduced_loss, + 'query_hs': q_hs, + 'doc_hs': d_hs, + 'avg_pos_cs': pos_cs, + 'avg_neg_cs': neg_cs, + 'diff_cs': diff_cs, + }, + ) + elif validation_step and not self.cfg.data.get('validation_drop_last', True): num_valid_tokens_in_ub = batch['num_valid_tokens_in_ub'] if loss_for_ub.isnan(): assert batch['loss_mask'].count_nonzero() == 0, 'Got NaN loss with non-empty input' diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 7ab00f1af85a..0320fc6c0713 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -97,6 +97,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self._reset_activation_checkpointing_args() self.virtual_tokens = 0 + self.init_global_step = 0 def setup_metric(self, data_cfg): metric_name = "exact_string_match" @@ -160,6 +161,11 @@ def setup_metric(self, data_cfg): def _metrics_require_string2category_map(self): return set(["f1", "accuracy", "average_precision"]) + def maybe_setup_test(self): + if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: + self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) + return + def setup(self, stage=None): # NOTE: super().__init__ will try and setup train/val/test datasets, but we sidestep this using a if self._train_ds is not None condition # We then set things up for real only once setup() of this class is called. 
@@ -182,8 +188,7 @@ def setup(self, stage=None): self.setup_training_dataloader() if hasattr(self, '_validation_ds'): self._validation_dl = self.setup_eval_dataloader(self._validation_ds, self.cfg.data.validation_ds) - if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: - self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) + self.maybe_setup_test() # when using pipeline model parallel the final stage need to initialize word embeddings self.initialize_last_rank_embeddings() @@ -369,8 +374,15 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): first_val_step=first_val_step, ) + non_loss_tensors = {} # only the last stages of the pipeline return losses if losses_reduced_per_micro_batch: + for item in losses_reduced_per_micro_batch: + for k, v in item.items(): + if k != 'avg': + av = non_loss_tensors.get(k, []) + av.append(v) + non_loss_tensors[k] = av if (not forward_only) or self.cfg.data.get('validation_drop_last', True): # average loss across micro batches loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] @@ -396,7 +408,12 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): else: loss_mean = torch.tensor(0.0).cuda() - return loss_mean + # if forward_only: + # return loss_mean + if non_loss_tensors: # TODO: need a nicer way to do this via inheritance (@adithyare) + return loss_mean, non_loss_tensors + else: + return loss_mean def validation_step(self, dataloader_iter): return self.inference_step(dataloader_iter, 'validation') @@ -409,6 +426,23 @@ def inference_step(self, dataloader_iter, mode): data_cfg = self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds self._reconfigure_and_process_inference_batch(batch, data_cfg) # Meta data from dataset + outputs = self.inference_step_validation_call(batch, batch_idx, data_cfg, dataloader_idx) + + if mode == 'validation': + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + # super().validation_step appends just loss to self.validation_step_outputs, replace the last appended loss with the outputs dict + self.validation_step_outputs[dataloader_idx][-1] = outputs + else: + # super().validation_step appends just loss to self.validation_step_outputs, replace the last appended loss with the outputs dict + self.validation_step_outputs[-1] = outputs + else: + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx][-1] = outputs + else: + self.test_step_outputs[-1] = outputs + return outputs + + def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): metadata = batch.get('metadata', [{}] * len(batch['tokens'])) # Pass dataloader_idx, as it's needed in val_step of GPTModel to append the loss correctly to self.val/test_step_outputs # in case of multi dataloaders @@ -442,22 +476,91 @@ def inference_step(self, dataloader_iter, mode): 'inputs': inputs_text, # [str] 'metadata': metadata, # [dict] } + return outputs - if mode == 'validation': - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - # super().validation_step appends just loss to self.validation_step_outputs, replace the last appended loss with the outputs dict - self.validation_step_outputs[dataloader_idx][-1] = outputs + def gather_and_maybe_write_predictions(self, output, data_cfg, mode, dataloader_idx=0): + # Gather the 
outputs object from all data parallel ranks since we are using the DistributedSampler which splits data across DDP ranks. + gathered_outputs = [None for _ in range(parallel_state.get_data_parallel_world_size())] + torch.distributed.all_gather_object( + gathered_outputs, + [ + {'preds': x['preds'], 'labels': x['labels'], 'inputs': x['inputs'], 'metadata': x['metadata']} + for x in output + ], + group=parallel_state.get_data_parallel_group(), + ) + + # Remove duplicate examples due to distributed sampler. + deduplicated_outputs = { + 'preds': [], + 'labels': [], + 'inputs': [], + 'metadata': [], + } + total_size = 0 + for rank in range(0, parallel_state.get_data_parallel_world_size()): + for batch in gathered_outputs[rank]: + for pred, label, input, metadata in zip( + batch['preds'], batch['labels'], batch['inputs'], batch['metadata'] + ): + total_size += 1 + if not metadata.get("__AUTOGENERATED__", False): + deduplicated_outputs['preds'].append(pred) + deduplicated_outputs['labels'].append(label) + deduplicated_outputs['inputs'].append(input) + deduplicated_outputs['metadata'].append(metadata) + else: + logging.info(f"skipping autogenerated example example {input} prediction {pred} label {label}") + + # Compute metric score + metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name + metric_label_key = self.val_metric_label_key if mode == 'validation' else self.test_metric_label_key + if metric_name != 'loss': + metric_log_key = self._determine_log_key(data_cfg, dataloader_idx, metric_name, mode) + metric_fn = self.val_metric[dataloader_idx] if mode == 'validation' else self.test_metric[dataloader_idx] + if metric_label_key in deduplicated_outputs['metadata'][0]: + labels = [m[metric_label_key] for m in deduplicated_outputs['metadata']] else: - # super().validation_step appends just loss to self.validation_step_outputs, replace the last appended loss with the outputs dict - self.validation_step_outputs[-1] = outputs - else: - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx][-1] = outputs + labels = deduplicated_outputs['labels'] + + for pred, label in zip(deduplicated_outputs['preds'], labels): + _ = metric_fn(pred, label) + + metric_result = metric_fn.compute() + + if metric_name == 'rouge': + for k, v in metric_result.items(): + if 'fmeasure' in k: + self.log(metric_log_key + f'_{k}', v.item(), sync_dist=True) + logging.info(f"{mode} {metric_name} {k}: {v.item()}") + metric_result = metric_result['rouge1_fmeasure'] else: - self.test_step_outputs[-1] = outputs - return outputs + self.log(metric_log_key, metric_result.item(), sync_dist=True) + logging.info(f"{mode} {metric_name}: {metric_result.item()}") + + metric_fn.reset() + averaged_metric.append(metric_result) + + # Write predictions to file + if self.global_rank == 0 and data_cfg.get("write_predictions_to_file", False): + logging.info( + f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['inputs'])}" + ) + + # Check if the user provided a prefix path to the file(s) they want to write. + if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: + raise ValueError( + f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." 
+ ) + filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) + self.write_predictions_to_file( + deduplicated_outputs, f"{data_cfg.output_file_path_prefix}_{filename_log_key}" + ) + + return deduplicated_outputs, total_size def inference_epoch_end(self, outputs, mode, data_cfg): + # TODO: this method should be modularized. It is too long and does too many things. (@adithyare) # Parent class will handle logging of the loss. if not outputs or not outputs[0]: return @@ -487,92 +590,13 @@ def inference_epoch_end(self, outputs, mode, data_cfg): # we can only log on one rank if it is rank zero so we broadcast from last rank torch.distributed.broadcast(loss, get_last_rank()) - if mode != 'test': - self.log('val_loss', loss, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log('val_loss', loss, prog_bar=True, rank_zero_only=True, batch_size=1) + # Determine the key used to log the loss based on the user provided name of the dataset or the dataloader index. + loss_log_key = self._determine_log_key(data_cfg, dataloader_idx, "loss", mode) + self.log(loss_log_key, loss, batch_size=1) averaged_loss.append(loss) - - # Gather the outputs object from all data parallel ranks since we are using the DistributedSampler which splits data across DDP ranks. - gathered_outputs = [None for _ in range(parallel_state.get_data_parallel_world_size())] - torch.distributed.all_gather_object( - gathered_outputs, - [ - {'preds': x['preds'], 'labels': x['labels'], 'inputs': x['inputs'], 'metadata': x['metadata']} - for x in output - ], - group=parallel_state.get_data_parallel_group(), - ) - - # Remove duplicate examples due to distributed sampler. - deduplicated_outputs = { - 'preds': [], - 'labels': [], - 'inputs': [], - 'metadata': [], - } - total_size = 0 - for rank in range(0, parallel_state.get_data_parallel_world_size()): - for batch in gathered_outputs[rank]: - for pred, label, input, metadata in zip( - batch['preds'], batch['labels'], batch['inputs'], batch['metadata'] - ): - total_size += 1 - if not metadata.get("__AUTOGENERATED__", False): - deduplicated_outputs['preds'].append(pred) - deduplicated_outputs['labels'].append(label) - deduplicated_outputs['inputs'].append(input) - deduplicated_outputs['metadata'].append(metadata) - else: - logging.info( - f"skipping autogenerated example example {input} prediction {pred} label {label}" - ) - - # Compute metric score - metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name - metric_label_key = self.val_metric_label_key if mode == 'validation' else self.test_metric_label_key - if metric_name != 'loss': - metric_log_key = self._determine_log_key(data_cfg, dataloader_idx, metric_name, mode) - metric_fn = ( - self.val_metric[dataloader_idx] if mode == 'validation' else self.test_metric[dataloader_idx] - ) - if metric_label_key in deduplicated_outputs['metadata'][0]: - labels = [m[metric_label_key] for m in deduplicated_outputs['metadata']] - else: - labels = deduplicated_outputs['labels'] - - for pred, label in zip(deduplicated_outputs['preds'], labels): - _ = metric_fn(pred, label) - - metric_result = metric_fn.compute() - - if metric_name == 'rouge': - for k, v in metric_result.items(): - if 'fmeasure' in k: - self.log(metric_log_key + f'_{k}', v.item(), sync_dist=True) - logging.info(f"{mode} {metric_name} {k}: {v.item()}") - metric_result = metric_result['rouge1_fmeasure'] - else: - self.log(metric_log_key, metric_result.item(), sync_dist=True) - logging.info(f"{mode} {metric_name}: 
{metric_result.item()}") - - metric_fn.reset() - averaged_metric.append(metric_result) - - # Write predictions to file - if self.global_rank == 0 and data_cfg.get("write_predictions_to_file", False): - logging.info( - f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['inputs'])}" - ) - - # Check if the user provided a prefix path to the file(s) they want to write. - if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: - raise ValueError( - f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." - ) - filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) - self.write_predictions_to_file( - deduplicated_outputs, f"{data_cfg.output_file_path_prefix}_{filename_log_key}" - ) + self.gather_and_maybe_write_predictions(output, data_cfg, mode, dataloader_idx) torch.distributed.barrier(group=parallel_state.get_data_parallel_group()) outputs[dataloader_idx].clear() # free memory @@ -759,6 +783,14 @@ def _reconfigure_and_process_inference_batch(self, batch, data_cfg): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) + def maybe_build_test(self): + if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: + logging.info('Building GPT SFT test datasets.') + # Wrap this in a list since the general finetuning parent class supports multi-validation. + self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) + logging.info(f'Length of test dataset: {len(self._test_ds[0])}') + return + def build_train_valid_test_datasets(self, stage): if stage != 'test': logging.info('Building GPT SFT validation datasets.') @@ -767,11 +799,7 @@ def build_train_valid_test_datasets(self, stage): logging.info(f'Length of val dataset: {len(self._validation_ds[0])}') if stage != 'validate': - if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: - logging.info('Building GPT SFT test datasets.') - # Wrap this in a list since the general finetuning parent class supports multi-validation. 
- self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) - logging.info(f'Length of test dataset: {len(self._test_ds[0])}') + self.maybe_build_test() if stage == 'validate' or stage == 'test': return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 38c887304f7a..e016022a6c44 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -708,8 +708,6 @@ def id_func(output_tensor): return fwd_output_only_func - ########## - def _test_validation_step(self, dataloader_iter): """ Shared code for validation and test step diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 26555cc3341a..7e4df2f27c6d 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -503,6 +503,7 @@ def merge_inference_cfg(cls, path: str, cfg: DictConfig) -> DictConfig: with open_dict(cfg): cfg.inference.add_BOS = peft_cfg.data.test_ds.add_bos - cfg.inference.tokens_to_generate = peft_cfg.data.test_ds.tokens_to_generate + cfg.inference.tokens_to_generate = peft_cfg.data.test_ds.get("tokens_to_generate", 1) + peft_cfg.megatron_amp_O2 = False # always evaluate with O1 return peft_cfg diff --git a/examples/nlp/information_retrieval/construct_random_negatives.py b/scripts/construct_random_negatives.py similarity index 100% rename from examples/nlp/information_retrieval/construct_random_negatives.py rename to scripts/construct_random_negatives.py diff --git a/examples/nlp/information_retrieval/get_msmarco.sh b/scripts/information_retrieval/get_msmarco.sh similarity index 100% rename from examples/nlp/information_retrieval/get_msmarco.sh rename to scripts/information_retrieval/get_msmarco.sh From 87489c87380827f8f408455f04a16b8e4ae9feeb Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 15 Mar 2024 03:34:22 +0200 Subject: [PATCH 024/140] add mcore updates (#8643) * add mcore updaates Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- Jenkinsfile | 2 +- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- .../nlp/data/language_modeling/megatron/gpt_fim_dataset.py | 6 +++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index b278a53d8213..45c766a966d6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -99,7 +99,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ + git checkout a5415fcfacef2a37416259bd38b7c4b673583675 && \ pip install .' } } diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index c9f8b8952d5e..368efc7b3b77 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -235,6 +235,7 @@ model: data_prefix: ??? 
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix data_impl: mmap + mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} skip_warmup: True @@ -247,8 +248,7 @@ model: no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled - exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem - mock_dataset: False # Set to True and data_prefix to None to use artificially generated mock dataset + exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem # Nsys profiling options nsys_profile: diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py index 17576bea4c75..20ebf555f0b5 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py @@ -16,7 +16,7 @@ import numpy as np from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig -from megatron.core.datasets.indexed_dataset import MMapIndexedDataset +from megatron.core.datasets.indexed_dataset import IndexedDataset from megatron.core.datasets.utils import Split @@ -41,7 +41,7 @@ class GPTFIMDataset(GPTDataset): """The base GPT dataset Args: - indexed_dataset (MMapIndexedDataset): The MMapIndexedDataset around which to build the + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset indexed_indices (np.ndarray): The set of the documents indices to expose @@ -55,7 +55,7 @@ class GPTFIMDataset(GPTDataset): def __init__( self, - indexed_dataset: MMapIndexedDataset, + indexed_dataset: IndexedDataset, dataset_path: str, indexed_indices: np.ndarray, num_samples: int, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index c9aae27eb5ed..7cdb8b3abb37 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1302,6 +1302,7 @@ def build_train_valid_test_datasets(self): "reset_attention_mask": self.reset_attention_mask, "eod_mask_loss": self.eod_mask_loss, "mock": mock_dataset, + "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), } # support for dict data input type From 355e36c344be55b2bf7b1fd55f5554a831e6fcd3 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 15 Mar 2024 14:50:32 +0200 Subject: [PATCH 025/140] FSDP update to PTL 2.2 (#8658) * fsdp fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * return extra line Signed-off-by: dimapihtar * fix empty line Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- 
nemo/collections/nlp/parts/megatron_trainer_builder.py | 8 +++++++- nemo/collections/nlp/parts/nlp_overrides.py | 5 +++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 368efc7b3b77..79bd7c1473f5 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -145,7 +145,7 @@ model: # FSDP fsdp: False # Enable training with torch FSDP. fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'. - fsdp_grad_reduce_dtype: 'fp32' # Gradient reduction data type. + fsdp_grad_reduce_dtype: 32 # Gradient reduction data type. fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. ## Activation Checkpointing diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 055671219fb8..968674b0fb92 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -19,6 +19,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.plugins.precision.fsdp import FSDPPrecision from nemo.collections.nlp.parts.nlp_overrides import ( CustomProgressBar, @@ -113,7 +114,12 @@ def _plugins(self) -> list: if megatron_amp_O2 and not with_distributed_adam: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + if self.cfg.model.get('fsdp', False): + plugins.append(FSDPPrecision(precision=plugin_precision, scaler=scaler)) + else: + plugins.append( + PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) + ) self.cfg.trainer.precision = None if self.cfg.get('cluster_type', None) == 'BCP': diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 66fa99ffefd1..0b117fd8d860 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -589,6 +589,7 @@ def __init__( self.nccl_communicator_config_path = nccl_communicator_config_path self.sharp = sharp + self.sharding_strategy = sharding_strategy super().__init__(**kwargs) def _set_mixed_precision_recipe( @@ -625,7 +626,7 @@ def setup_environment(self) -> None: if not parallel_state.model_parallel_is_initialized(): app_state = AppState() assert app_state.pipeline_model_parallel_size == 1, "FSDP does not support pipeline parallelism" - if self.kwargs['sharding_strategy'] == ShardingStrategy.HYBRID_SHARD: + if self.sharding_strategy == ShardingStrategy.HYBRID_SHARD: assert ( app_state.tensor_model_parallel_size == 1 ), "FSDP hybrid sharding cannot be used when tensor_model_parallel_size > 1." 
@@ -678,7 +679,7 @@ def optimizer_state(self, optimizer: torch.optim.Optimizer) -> Dict[str, torch.T optim_state_dict = FSDP.optim_state_dict(self.model, optimizer) return optim_state_dict - def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict=None) -> None: # Release strict state dict matching when using Megatron AMP-O2 to skip matching # half-precision module wrapper module. # TODO: Refactor this to be more generic. From e848378053958a98897cdeb58c5f3a20dfe08af8 Mon Sep 17 00:00:00 2001 From: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Date: Fri, 15 Mar 2024 18:52:57 +0400 Subject: [PATCH 026/140] Add CTC-WS documentation (#8470) * add ctcws doc Signed-off-by: andrusenkoau * add ctcws docs Signed-off-by: andrusenkoau * fixes Signed-off-by: andrusenkoau --------- Signed-off-by: andrusenkoau --- ...r_language_modeling_and_customization.rst} | 119 +++++++++++++++--- docs/source/asr/intro.rst | 2 +- 2 files changed, 100 insertions(+), 21 deletions(-) rename docs/source/asr/{asr_language_modeling.rst => asr_language_modeling_and_customization.rst} (88%) diff --git a/docs/source/asr/asr_language_modeling.rst b/docs/source/asr/asr_language_modeling_and_customization.rst similarity index 88% rename from docs/source/asr/asr_language_modeling.rst rename to docs/source/asr/asr_language_modeling_and_customization.rst index fc3b9c26effc..013b31dd28cd 100644 --- a/docs/source/asr/asr_language_modeling.rst +++ b/docs/source/asr/asr_language_modeling_and_customization.rst @@ -1,5 +1,5 @@ ##################### -ASR Language Modeling +ASR Language Modeling and Customization ##################### Language models have shown to help the accuracy of ASR models. NeMo supports the following two approaches to incorporate language models into the ASR models: @@ -472,6 +472,7 @@ You can then pass this file to your flashlight config object during decoding: decoding.beam.flashlight_cfg.beam_size_token = 32 \ decoding.beam.flashlight_cfg.beam_threshold = 25.0 + Combine N-gram Language Models ============================== @@ -526,22 +527,100 @@ The following is the list of the arguments for the opengrm script: | kenlm_bin_path | str | Required | The path to the bin folder of KenLM library. It is a folder named `bin` under where KenLM is installed. | +----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ | ngram_bin_path | str | Required | The path to the bin folder of OpenGrm Ngram. It is a folder named `bin` under where OpenGrm Ngram is installed. 
| -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| arpa_a | str | Required | Path to the ARPA N-gram model file A | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| alpha | float | Required | Weight of N-gram model A | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| arpa_b | int | Required | Path to the ARPA N-gram model file B | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| beta | float | Required | Weight of N-gram model B | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| out_path | str | Required | Path for writing temporary and resulting files. | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| test_file | str | None | Path to test file to count perplexity if provided. | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| symbols | str | None | Path to symbols (.syms) file. Could be calculated if it is not provided. | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| nemo_model_file | str | None | The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model. | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| force | bool | ``False`` | Whether to recompile and rewrite all files | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| arpa_a | str | Required | Path to the ARPA N-gram model file A | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| alpha | float | Required | Weight of N-gram model A | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| arpa_b | int | Required | Path to the ARPA N-gram model file B | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| beta | float | Required | Weight of N-gram model B | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| out_path | str | Required | Path for writing temporary and resulting files. 
| ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| test_file | str | None | Path to test file to count perplexity if provided. | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| symbols | str | None | Path to symbols (.syms) file. Could be calculated if it is not provided.| ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| nemo_model_file | str | None | The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model. | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| force | bool | ``False`` | Whether to recompile and rewrite all files | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ + + +****************** +Context-biasing (word boosting) without external LM +****************** + +NeMo toolkit supports a fast context-biasing method for CTC and Transducer (RNN-T) ASR models with CTC-based Word Spotter. +The method involves decoding CTC log probabilities with a context graph built for words and phrases from the context-biasing list. +The spotted context-biasing candidates (with their scores and time intervals) are compared by scores with words from the greedy CTC decoding results to improve recognition accuracy and pretend false accepts of context-biasing. + +A Hybrid Transducer-CTC model (a shared encoder trained together with CTC and Transducer output heads) enables the use of the CTC-WS method for the Transducer model. +Context-biasing candidates obtained by CTC-WS are also filtered by the scores with greedy CTC predictions and then merged with greedy Transducer results. + +Scheme of the CTC-WS method: + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.22.0/asset-post-v1.22.0-ctcws_scheme_1.png + :align: center + :alt: CTC-WS scheme + :scale: 40% + +High-level overview of the context-biasing words replacement with CTC-WS method: + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.22.0/asset-post-v1.22.0-ctcws_scheme_2.png + :align: center + :alt: CTC-WS high level overview + :scale: 40% + +More details about CTC-WS context-biasing can be found in the `tutorial `__. + +To use CTC-WS context-biasing, you need to create a context-biasing text file that contains words/phrases to be boosted, with its transcriptions (spellings) separated by underscore. +Multiple transcriptions can be useful for abbreviations ("gpu" -> "g p u"), compound words ("nvlink" -> "nv link"), +or words with common mistakes in the case of our ASR model ("nvidia" -> "n video"). + +Example of the context-biasing file: + +.. code-block:: + + nvidia_nvidia + omniverse_omniverse + gpu_gpu_g p u + dgx_dgx_d g x_d gx + nvlink_nvlink_nv link + ray tracing_ray tracing + +The main script for CTC-WS context-biasing in NeMo is: + +.. code-block:: + + {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py + +Context-biasing is managed by ``apply_context_biasing`` parameter [true or false]. 
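Assuming the underscore-separated format shown in the example file above, the context-biasing list can be read into a phrase-to-spellings mapping along these lines (a minimal stdlib sketch, not the parser used by the NeMo script):

```python
# Minimal sketch: parse "phrase_spelling1_spelling2" lines into {phrase: [spellings]}.
def load_context_biasing_list(path):
    boosted = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            phrase, *spellings = line.split("_")
            # Fall back to the phrase itself when no alternative spelling is given.
            boosted[phrase] = spellings or [phrase]
    return boosted

# e.g. "gpu_gpu_g p u" -> {"gpu": ["gpu", "g p u"]}
```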
+Other important context-biasing parameters are: + +* ``beam_threshold`` - threshold for CTC-WS beam pruning +* ``context_score`` - per token weight for context biasing +* ``ctc_ali_token_weight`` - per token weight for CTC alignment (prevents false acceptances of context-biasing words) + +All the context-biasing parameters are selected according to the default values in the script. +You can tune them according to your data and ASR model (list all the values in the [] separated by commas) +for example: ``beam_threshold=[7.0,8.0,9.0]``, ``context_score=[3.0,4.0,5.0]``, ``ctc_ali_token_weight=[0.5,0.6,0.7]``. +The script will run the recognition with all the combinations of the parameters and will select the best one based on WER value. + +.. code-block:: + + # Context-biasing with the CTC-WS method for CTC ASR model + python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \ + nemo_model_file={ctc_model_name} \ + input_manifest={test_nemo_manifest} \ + preds_output_folder={exp_dir} \ + decoder_type="ctc" \ + acoustic_batch_size=64 \ + apply_context_biasing=true \ + context_file={cb_list_file_modified} \ + beam_threshold=[7.0] \ + context_score=[3.0] \ + ctc_ali_token_weight=[0.5] + +To use Transducer head of the Hybrid Transducer-CTC model, you need to set ``decoder_type=rnnt``. diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 540c26d71239..79d1f3e3e3f8 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -184,7 +184,7 @@ For more information, see additional sections in the ASR docs on the left-hand-s models datasets - asr_language_modeling + asr_language_modeling_and_customization results scores configs From 0fbfa211ee6681ff1937e0fe902d5460a0c5a342 Mon Sep 17 00:00:00 2001 From: Jagadeesh Balam <4916480+jbalam-nv@users.noreply.github.com> Date: Fri, 15 Mar 2024 10:38:11 -0700 Subject: [PATCH 027/140] Rearranged the order of asr models (#8653) * Changed the order of models in asr/models doc Signed-off-by: Jagadeesh Balam * Rearranged Conformer models Signed-off-by: Jagadeesh Balam --------- Signed-off-by: Jagadeesh Balam Signed-off-by: Jagadeesh Balam Co-authored-by: Jagadeesh Balam Co-authored-by: Jagadeesh Balam --- docs/source/asr/examples/kinyarwanda_asr.rst | 6 +- docs/source/asr/models.rst | 291 ++++++++++--------- 2 files changed, 154 insertions(+), 143 deletions(-) diff --git a/docs/source/asr/examples/kinyarwanda_asr.rst b/docs/source/asr/examples/kinyarwanda_asr.rst index bd1eac94e31f..f8057585b104 100644 --- a/docs/source/asr/examples/kinyarwanda_asr.rst +++ b/docs/source/asr/examples/kinyarwanda_asr.rst @@ -1,5 +1,9 @@ + +Example With MCV +================ + ######################################################################## -Example: Kinyarwanda ASR using Mozilla Common Voice Dataset +Kinyarwanda ASR using Mozilla Common Voice Dataset ######################################################################## In this example, we describe essential steps of training an ASR model for a new language (Kinyarwanda). Namely, diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 6b0087fd5f3d..cb7457b2d5d8 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -13,74 +13,12 @@ Pretrained checkpoints for all of these models, as well as instructions on how t section. You can use the available checkpoints for immediate inference, or fine-tune them on your own datasets. The checkpoints section also contains benchmark results for the available ASR models. -.. 
_Jasper_model: - -Jasper ------- - -Jasper ("Just Another Speech Recognizer") :cite:`asr-models-li2019jasper` is a deep time delay neural network (TDNN) comprising of -blocks of 1D-convolutional layers. The Jasper family of models are denoted as ``Jasper_[BxR]`` where ``B`` is the number of blocks -and ``R`` is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D convolution, batch normalization, -ReLU, and dropout: - - .. image:: images/jasper_vertical.png - :align: center - :alt: jasper model - :scale: 50% - -Jasper models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. - -QuartzNet +.. _Conformer_model: +Conformer --------- - -QuartzNet :cite:`asr-models-kriman2019quartznet` is a version of Jasper :cite:`asr-models-li2019jasper` model with separable -convolutions and larger filters. It can achieve performance similar to Jasper but with an order of magnitude fewer parameters. -Similarly to Jasper, the QuartzNet family of models are denoted as ``QuartzNet_[BxR]`` where ``B`` is the number of blocks and ``R`` -is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D *separable* convolution, batch normalization, -ReLU, and dropout: - - .. image:: images/quartz_vertical.png - :align: center - :alt: quartznet model - :scale: 40% - -QuartzNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. - -.. _Citrinet_model: - -Citrinet --------- - -Citrinet is a version of QuartzNet :cite:`asr-models-kriman2019quartznet` that extends ContextNet :cite:`asr-models-han2020contextnet`, -utilizing subword encoding (via Word Piece tokenization) and Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to -obtain highly accurate audio transcripts while utilizing a non-autoregressive CTC based decoding scheme for efficient inference. - - .. image:: images/citrinet_vertical.png - :align: center - :alt: citrinet model - :scale: 50% - -Citrinet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class. - -.. _ContextNet_model: - -ContextNet ----------- - -ContextNet is a model uses Transducer/RNNT loss/decoder and is introduced in :cite:`asr-models-han2020contextnet`. -It uses Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to model larger context. -Unlike Citrinet, it has an autoregressive decoding scheme. - -ContextNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecRNNTBPEModel` class for a -model with sub-word encoding and :class:`~nemo.collections.asr.models.EncDecRNNTModel` for char-based encoding. - -You may find the example config files of ContextNet model with character-based encoding at -``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml``. - .. _Conformer-CTC_model: - Conformer-CTC +~~~~~~~~~~~~~ ------------- Conformer-CTC is a CTC-based variant of the Conformer model introduced in :cite:`asr-models-gulati2020conformer`. Conformer-CTC has a @@ -109,7 +47,7 @@ with sub-word encoding at ``/examples/asr/conf/conformer/conforme .. _Conformer-Transducer_model: Conformer-Transducer --------------------- +~~~~~~~~~~~~~~~~~~~~ Conformer-Transducer is the Conformer model introduced in :cite:`asr-models-gulati2020conformer` and uses RNNT/Transducer loss/decoder. 
It has the same encoder as Conformer-CTC but utilizes RNNT/Transducer loss/decoder which makes it an autoregressive model. @@ -128,6 +66,32 @@ You may find the example config files of Conformer-Transducer model with charact ``/examples/asr/conf/conformer/conformer_transducer_char.yaml`` and with sub-word encoding at ``/examples/asr/conf/conformer/conformer_transducer_bpe.yaml``. +.. _Conformer-HAT_model: + +Conformer-HAT +~~~~~~~~~~~~~ + +Conformer HAT (Hybrid Autoregressive Transducer) model (do not confuse it with Hybrid-Transducer-CTC) is a modification of Conformer-Transducer model based on this previous `work `_. +The main idea is to separate labels and blank score predictions, which allows to estimate the internal LM probabilities during decoding. +When external LM is available for inference, the internal LM can be subtracted from HAT model prediction in beamsearch decoding to improve external LM efficiency. +It can be helpful in the case of text-only adaptation for new domains. + +The only difference from the standard Conformer-Transducer model (RNNT) is the use of `"HATJiont" `_ +class (instead of "RNNTJoint") for joint module. The all HAT logic is implemented in the "HATJiont" class. + + .. image:: images/hat.png + :align: center + :alt: HAT Model + :scale: 50% + +You may find the example config files of Conformer-HAT model with character-based encoding at +``/examples/asr/conf/conformer/hat/conformer_hat_char.yaml`` and +with sub-word encoding at ``/examples/asr/conf/conformer/hat/conformer_hat_bpe.yaml``. + +By default, the decoding for HAT model works in the same way as for Conformer-Transducer. +In the case of external ngram LM fusion you can use ``/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py``. +To enable HAT internal LM subtraction set ``hat_subtract_ilm=True`` and find more appropriate couple of ``beam_alpha`` and ``hat_ilm_weight`` values in terms of the best recognition accuracy. + Fast-Conformer -------------- @@ -242,56 +206,6 @@ To include caching support, `model.set_export_config({'cache_support' : 'True'}) Or, if ``/scripts/export.py`` is being used: `python export.py cache_aware_conformer.nemo cache_aware_conformer.onnx --export-config cache_support=True` -.. _LSTM-Transducer_model: - -LSTM-Transducer ---------------- - -LSTM-Transducer is a model which uses RNNs (eg. LSTM) in the encoder. The architecture of this model is followed from suggestions in :cite:`asr-models-he2019streaming`. -It uses RNNT/Transducer loss/decoder. The encoder consists of RNN layers (LSTM as default) with lower projection size to increase the efficiency. -Layer norm is added between the layers to stabilize the training. -It can be trained/used in unidirectional or bidirectional mode. The unidirectional mode is fully causal and can be used easily for simple and efficient streaming. However the accuracy of this model is generally lower than other models like Conformer and Citrinet. - -This model supports both the sub-word level and character level encodings. You may find the example config file of RNNT model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_transducer_bpe.yaml``. -You can find more details on the config files for the RNNT models at `LSTM-Transducer <./configs.html#lstm-transducer>`_. - -.. _LSTM-CTC_model: - -LSTM-CTC --------- - -LSTM-CTC model is a CTC-variant of the LSTM-Transducer model which uses CTC loss/decoding instead of Transducer. 
-You may find the example config file of LSTM-CTC model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_ctc_bpe.yaml``. - -.. _Squeezeformer-CTC_model: - -Squeezeformer-CTC ------------------ - -Squeezeformer-CTC is a CTC-based variant of the Squeezeformer model introduced in :cite:`asr-models-kim2022squeezeformer`. Squeezeformer-CTC has a -similar encoder as the original Squeezeformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. The vast majority of the architecture is similar to Conformer model, so please refer to `Conformer-CTC <./models.html#conformer-ctc>`_. - -The model primarily differs from Conformer in the following ways : - -* Temporal U-Net style time reduction, effectively reducing memory consumption and FLOPs for execution. -* Unified activations throughout the model. -* Simplification of module structure, removal of redundant layers. - -Here is the overall architecture of the encoder of Squeezeformer-CTC: - - .. image:: images/squeezeformer.png - :align: center - :alt: Squeezeformer-CTC Model - :scale: 50% - -This model supports both the sub-word level and character level encodings. You can find more details on the config files for the -Squeezeformer-CTC models at `Squeezeformer-CTC <./configs.html#squeezeformer-ctc>`_. The variant with sub-word encoding is a BPE-based model -which can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class, while the -character-based variant is based on :class:`~nemo.collections.asr.models.EncDecCTCModel`. - -You may find the example config files of Squeezeformer-CTC model with character-based encoding at -``/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml``. .. _Hybrid-Transducer_CTC_model: @@ -323,32 +237,6 @@ To export as CTC (single encoder+decoder graph), `model.set_export_config({'deco Or, if ``/scripts/export.py`` is being used: `python export.py hybrid_transducer.nemo hybrid_transducer.onnx --export-config decoder_type=ctc` -.. _Conformer-HAT_model: - -Conformer-HAT (Hybrid Autoregressive Transducer) ------------------------------------------------- -Conformer HAT model (do not confuse it with Hybrid-Transducer-CTC) is a modification of Conformer-Transducer model based on `Google paper `_. -The main idea is to separate labels and blank score predictions, which allows to estimate the internal LM probabilities during decoding. -When external LM is available for inference, the internal LM can be subtracted from HAT model prediction in beamsearch decoding to improve external LM efficiency. -It can be helpful in the case of text-only adaptation for new domains. - -The only difference from the standard Conformer-Transducer model (RNNT) is the use of `"HATJiont" `_ -class (instead of "RNNTJoint") for joint module. The all HAT logic is implemented in the "HATJiont" class. - - .. image:: images/hat.png - :align: center - :alt: HAT Model - :scale: 50% - -You may find the example config files of Conformer-HAT model with character-based encoding at -``/examples/asr/conf/conformer/hat/conformer_hat_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/conformer/hat/conformer_hat_bpe.yaml``. - -By default, the decoding for HAT model works in the same way as for Conformer-Transducer. -In the case of external ngram LM fusion you can use ``/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py``. 
-To enable HAT internal LM subtraction set ``hat_subtract_ilm=True`` and find more appropriate couple of ``beam_alpha`` and ``hat_ilm_weight`` values in terms of the best recognition accuracy. - - .. _Hybrid-ASR-TTS_model: Hybrid ASR-TTS Model @@ -406,6 +294,125 @@ A typical workflow to create and use the ensemble is like this Note that the ensemble cannot be modified after construction (e.g. it does not support finetuning) and only transcribe functionality is supported (e.g., ``.forward()`` is not properly defined). +.. _Jasper_model: + +Jasper +------ + +Jasper ("Just Another Speech Recognizer") :cite:`asr-models-li2019jasper` is a deep time delay neural network (TDNN) comprising of +blocks of 1D-convolutional layers. The Jasper family of models are denoted as ``Jasper_[BxR]`` where ``B`` is the number of blocks +and ``R`` is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D convolution, batch normalization, +ReLU, and dropout: + + .. image:: images/jasper_vertical.png + :align: center + :alt: jasper model + :scale: 50% + +Jasper models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. + +.. _Quartznet_model: + +QuartzNet +--------- + +QuartzNet :cite:`asr-models-kriman2019quartznet` is a version of Jasper :cite:`asr-models-li2019jasper` model with separable +convolutions and larger filters. It can achieve performance similar to Jasper but with an order of magnitude fewer parameters. +Similarly to Jasper, the QuartzNet family of models are denoted as ``QuartzNet_[BxR]`` where ``B`` is the number of blocks and ``R`` +is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D *separable* convolution, batch normalization, +ReLU, and dropout: + + .. image:: images/quartz_vertical.png + :align: center + :alt: quartznet model + :scale: 40% + +QuartzNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. + + +.. _Citrinet_model: + +Citrinet +-------- + +Citrinet is a version of QuartzNet :cite:`asr-models-kriman2019quartznet` that extends ContextNet :cite:`asr-models-han2020contextnet`, +utilizing subword encoding (via Word Piece tokenization) and Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to +obtain highly accurate audio transcripts while utilizing a non-autoregressive CTC based decoding scheme for efficient inference. + + .. image:: images/citrinet_vertical.png + :align: center + :alt: citrinet model + :scale: 50% + +Citrinet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class. + +.. _ContextNet_model: + +ContextNet +---------- + +ContextNet is a model uses Transducer/RNNT loss/decoder and is introduced in :cite:`asr-models-han2020contextnet`. +It uses Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to model larger context. +Unlike Citrinet, it has an autoregressive decoding scheme. + +ContextNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecRNNTBPEModel` class for a +model with sub-word encoding and :class:`~nemo.collections.asr.models.EncDecRNNTModel` for char-based encoding. + +You may find the example config files of ContextNet model with character-based encoding at +``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml`` and +with sub-word encoding at ``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml``. + +.. 
_Squeezeformer-CTC_model: + +Squeezeformer-CTC +----------------- + +Squeezeformer-CTC is a CTC-based variant of the Squeezeformer model introduced in :cite:`asr-models-kim2022squeezeformer`. Squeezeformer-CTC has a +similar encoder as the original Squeezeformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. The vast majority of the architecture is similar to Conformer model, so please refer to `Conformer-CTC <./models.html#conformer-ctc>`_. + +The model primarily differs from Conformer in the following ways : + +* Temporal U-Net style time reduction, effectively reducing memory consumption and FLOPs for execution. +* Unified activations throughout the model. +* Simplification of module structure, removal of redundant layers. + +Here is the overall architecture of the encoder of Squeezeformer-CTC: + + .. image:: images/squeezeformer.png + :align: center + :alt: Squeezeformer-CTC Model + :scale: 50% + +This model supports both the sub-word level and character level encodings. You can find more details on the config files for the +Squeezeformer-CTC models at `Squeezeformer-CTC <./configs.html#squeezeformer-ctc>`_. The variant with sub-word encoding is a BPE-based model +which can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class, while the +character-based variant is based on :class:`~nemo.collections.asr.models.EncDecCTCModel`. + +You may find the example config files of Squeezeformer-CTC model with character-based encoding at +``/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml`` and +with sub-word encoding at ``/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml``. + +.. _LSTM-Transducer_model: + +LSTM-Transducer +--------------- + +LSTM-Transducer is a model which uses RNNs (eg. LSTM) in the encoder. The architecture of this model is followed from suggestions in :cite:`asr-models-he2019streaming`. +It uses RNNT/Transducer loss/decoder. The encoder consists of RNN layers (LSTM as default) with lower projection size to increase the efficiency. +Layer norm is added between the layers to stabilize the training. +It can be trained/used in unidirectional or bidirectional mode. The unidirectional mode is fully causal and can be used easily for simple and efficient streaming. However the accuracy of this model is generally lower than other models like Conformer and Citrinet. + +This model supports both the sub-word level and character level encodings. You may find the example config file of RNNT model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_transducer_bpe.yaml``. +You can find more details on the config files for the RNNT models at `LSTM-Transducer <./configs.html#lstm-transducer>`_. + +.. _LSTM-CTC_model: + +LSTM-CTC +-------- + +LSTM-CTC model is a CTC-variant of the LSTM-Transducer model which uses CTC loss/decoding instead of Transducer. +You may find the example config file of LSTM-CTC model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_ctc_bpe.yaml``. 
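The model classes mentioned above (``EncDecCTCModel``, ``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel``) are typically loaded from released checkpoints via ``from_pretrained``. A minimal usage sketch follows; the checkpoint name is only an example, and the exact ``transcribe`` signature varies between NeMo releases:

```python
# Minimal sketch: load a pretrained CTC-BPE checkpoint and transcribe one file.
import nemo.collections.asr as nemo_asr

asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_citrinet_512")
transcriptions = asr_model.transcribe(["path/to/audio.wav"])
print(transcriptions[0])
```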
+ References ---------- From 0675b9dca908bfe3b7c4be838af5c7a0640b14c1 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Fri, 15 Mar 2024 12:22:57 -0700 Subject: [PATCH 028/140] Add generic check for dataloader_iter with PTL 2.2 (#8647) Signed-off-by: Abhishree Co-authored-by: Pablo Garay --- .../language_modeling/megatron_gpt_model.py | 22 +++++++------------ .../megatron_lm_encoder_decoder_model.py | 11 ++++------ .../megatron_t5_sft_model.py | 11 ++++------ .../machine_translation/megatron_nmt_model.py | 11 ++++------ 4 files changed, 20 insertions(+), 35 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7cdb8b3abb37..c44f95fccad4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -936,13 +936,10 @@ def get_batch(self, data_iterator, tuning): # Broadcast data. if data_iterator is not None: - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class validation steps. In that case extact only the batch - # from the data_iterator - if isinstance(data_iterator, _DataFetcherWrapper): - data, _, _ = next(data_iterator) - else: - data = next(data_iterator) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + data = next(data_iterator) + if isinstance(data, tuple): + data = data[0] else: data = None @@ -1113,13 +1110,10 @@ def loss_func(output_tensor): def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class validation steps. In that case extact only the batch - # from the data_iterator - if isinstance(dataloader_iter, _DataFetcherWrapper): - batch, _, _ = next(dataloader_iter) - else: - batch = next(dataloader_iter) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] extra_arg = {} if len(batch) == 3: batch = [x.cuda() for x in batch] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index e016022a6c44..651034c91520 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -567,13 +567,10 @@ def _process_batch(self, global_batch: Dict[str, torch.Tensor]) -> List[torch.Te def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(dataloader_iter, model): - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class or previous functions. 
In that case extact only the batch - # from the data_iterator - if isinstance(dataloader_iter, _DataFetcherWrapper): - batch, _, _ = next(dataloader_iter) - else: - batch = next(dataloader_iter) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] # convert to list if not already converted. if isinstance(batch, dict): # convert to list if not already converted. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py index 0b32530668be..2344dac3a64a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py @@ -293,13 +293,10 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class. In that case extact only the batch - # from the data_iterator - if isinstance(dataloader_iter, _DataFetcherWrapper): - batch, _, _ = next(dataloader_iter) - else: - batch = next(dataloader_iter) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] if isinstance(batch, dict): # convert to list if not already converted. batch = self._process_batch(batch) diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 952c76ce929e..5a41682a4b5b 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -292,13 +292,10 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class or previous functions. In that case extact only the batch - # from the data_iterator - if isinstance(dataloader_iter, _DataFetcherWrapper): - batch, _, _ = next(dataloader_iter) - else: - batch = next(dataloader_iter) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] if isinstance(batch, dict): # convert to list if not already converted. 
batch = self._process_batch(batch) From 9f8137458be72a75f3ca262355ae75211f929d85 Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Fri, 15 Mar 2024 12:51:38 -0700 Subject: [PATCH 029/140] Docs updates: compress NeMo core, add ASR Model spotlight (#8671) * make core_index.rst file Signed-off-by: Elena Rastorgueva * rename title from Tasks to NLP Tasks Signed-off-by: Elena Rastorgueva * move neural modules section to own RST file Signed-off-by: Elena Rastorgueva * add asr Spotlight Models Signed-off-by: Elena Rastorgueva * replace embedded hf space with canary one Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva --- docs/source/asr/asr_all.bib | 8 +++ docs/source/asr/intro.rst | 8 ++- docs/source/asr/models.rst | 34 +++++++++++ docs/source/core/core.rst | 90 ----------------------------- docs/source/core/core_index.rst | 37 ++++++++++++ docs/source/core/neural_modules.rst | 88 ++++++++++++++++++++++++++++ docs/source/index.rst | 7 +-- docs/source/nlp/models.rst | 4 +- 8 files changed, 176 insertions(+), 100 deletions(-) create mode 100644 docs/source/core/core_index.rst create mode 100644 docs/source/core/neural_modules.rst diff --git a/docs/source/asr/asr_all.bib b/docs/source/asr/asr_all.bib index 17caa233013e..11998d30cd5e 100644 --- a/docs/source/asr/asr_all.bib +++ b/docs/source/asr/asr_all.bib @@ -1033,3 +1033,11 @@ @misc{park2022multi year = {2022}, copyright = {Creative Commons Attribution 4.0 International} } + +@inproceedings{vaswani2017attention, + title={Attention is all you need}, + author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, + booktitle={Advances in Neural Information Processing Systems}, + pages={6000--6010}, + year={2017} +} \ No newline at end of file diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 79d1f3e3e3f8..8a244c3ea28d 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -148,7 +148,11 @@ There is also more information about the ASR model architectures available in Ne Try out NeMo ASR transcription in your browser ---------------------------------------------- -You can try out transcription with NeMo ASR models without leaving your browser, by using the HuggingFace Space embedded below. +You can try out transcription with a NeMo ASR model without leaving your browser, by using the HuggingFace Space embedded below. + +This HuggingFace Space uses `Canary-1B `__, the latest ASR model from NVIDIA NeMo. It sits at the top of the `HuggingFace OpenASR Leaderboard `__ at time of publishing. + +Canary-1B is a multi-lingual, multi-task model, supporting automatic speech-to-text recognition (ASR) in 4 languages (English, German, French, Spanish) as well as translation between English and the 3 other supported languages. .. raw:: html @@ -184,7 +188,7 @@ For more information, see additional sections in the ASR docs on the left-hand-s models datasets - asr_language_modeling_and_customization + asr_language_modeling results scores configs diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index cb7457b2d5d8..4f05cec410fa 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -13,6 +13,38 @@ Pretrained checkpoints for all of these models, as well as instructions on how t section. You can use the available checkpoints for immediate inference, or fine-tune them on your own datasets. 
The checkpoints section also contains benchmark results for the available ASR models. + +Spotlight Models +---------------- + +Canary +~~~~~~ + +Canary-1B is the latest ASR model from NVIDIA NeMo. It sits at the top of the `HuggingFace OpenASR Leaderboard `__ at time of publishing. + +You can `download the checkpoint `__ or try out Canary in action in this `HuggingFace Space `__. + +Canary-1B is an encoder-decoder model with a :ref:`FastConformer Encoder ` and Transformer Decoder :cite:`asr-models-vaswani2017attention`. + +It is a multi-lingual, multi-task model, supporting automatic speech-to-text recognition (ASR) in 4 languages (English, German, French, Spanish) as well as translation between English and the 3 other supported languages. + + +Parakeet +~~~~~~~~ + +Parakeet is the name of a family of ASR models with a :ref:`FastConformer Encoder ` and a CTC, RNN-T, or TDT decoder. + +Model checkpoints: + +* `Parakeet-CTC-0.6B `__ and `Parakeet-CTC-1.1B `__ model cards +* `Parakeet-RNNT-0.6B `__ and `Parakeet-RNNT-1.1B `__ model cards +* `Parakeet-TDT-1.1B `__ model card + +HuggingFace Spaces to try out Parakeet models in your browser: + +* `Parakeet-RNNT-1.1B `__ space +* `Parakeet-TDT-1.1B `__ space + .. _Conformer_model: Conformer --------- @@ -92,6 +124,8 @@ By default, the decoding for HAT model works in the same way as for Conformer-Tr In the case of external ngram LM fusion you can use ``/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py``. To enable HAT internal LM subtraction set ``hat_subtract_ilm=True`` and find more appropriate couple of ``beam_alpha`` and ``hat_ilm_weight`` values in terms of the best recognition accuracy. +.. _Fast-Conformer: + Fast-Conformer -------------- diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index a71495800216..7fe4a65cc32f 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -740,93 +740,3 @@ To register a child model, use the ``register_nemo_submodule`` method of the par else: self.child_model = None - -Neural Modules -============== - -NeMo is built around Neural Modules, conceptual blocks of neural networks that take typed inputs and produce typed outputs. Such -modules typically represent data layers, encoders, decoders, language models, loss functions, or methods of combining activations. -NeMo makes it easy to combine and re-use these building blocks while providing a level of semantic correctness checking via its neural -type system. - -.. note:: *All Neural Modules inherit from ``torch.nn.Module`` and are therefore compatible with the PyTorch ecosystem.* - -There are 3 types on Neural Modules: - - - Regular modules - - Dataset/IterableDataset - - Losses - -Every Neural Module in NeMo must inherit from `nemo.core.classes.module.NeuralModule` class. - -.. autoclass:: nemo.core.classes.module.NeuralModule - -Every Neural Modules inherits the ``nemo.core.classes.common.Typing`` interface and needs to define neural types for its inputs and outputs. -This is done by defining two properties: ``input_types`` and ``output_types``. Each property should return an ordered dictionary of -"port name"->"port neural type" pairs. Here is the example from :class:`~nemo.collections.asr.modules.ConvASREncoder` class: - -.. 
code-block:: python - - @property - def input_types(self): - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def output_types(self): - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - ) - - @typecheck() - def forward(self, audio_signal, length=None): - ... - -The code snippet above means that ``nemo.collections.asr.modules.conv_asr.ConvASREncoder`` expects two arguments: - * First one, named ``audio_signal`` of shape ``[batch, dimension, time]`` with elements representing spectrogram values. - * Second one, named ``length`` of shape ``[batch]`` with elements representing lengths of corresponding signals. - -It also means that ``.forward(...)`` and ``__call__(...)`` methods each produce two outputs: - * First one, of shape ``[batch, dimension, time]`` but with elements representing encoded representation (``AcousticEncodedRepresentation`` class). - * Second one, of shape ``[batch]``, corresponding to their lengths. - -.. tip:: It is a good practice to define types and add ``@typecheck()`` decorator to your ``.forward()`` method after your module is ready for use by others. - -.. note:: The outputs of ``.forward(...)`` method will always be of type ``torch.Tensor`` or container of tensors and will work with any other Pytorch code. The type information is attached to every output tensor. If tensors without types is passed to your module, it will not fail, however the types will not be checked. Thus, it is recommended to define input/output types for all your modules, starting with data layers and add ``@typecheck()`` decorator to them. - -.. note:: To temporarily disable typechecking, you can enclose your code in ```with typecheck.disable_checks():``` statement. - - -Dynamic Layer Freezing ----------------------- - -You can selectively freeze any modules inside a Nemo model by specifying a freezing schedule in the config yaml. Freezing stops any gradient updates -to that module, so that its weights are not changed for that step. This can be useful for combatting catastrophic forgetting, for example -when finetuning a large pretrained model on a small dataset. - -The default approach is to freeze a module for the first N training steps, but you can also enable freezing for a specific range of steps, -for example, from step 20 - 100, or even activate freezing from some N until the end of training. You can also freeze a module for the entire training run. -Dynamic freezing is specified in training steps, not epochs. - -To enable freezing, add the following to your config: - -.. code-block:: yaml - - model: - ... 
- freeze_updates: - enabled: true # set to false if you want to disable freezing - - modules: # list all of the modules you want to have freezing logic for - encoder: 200 # module will be frozen for the first 200 training steps - decoder: [50, -1] # module will be frozen at step 50 and will remain frozen until training ends - joint: [10, 100] # module will be frozen between step 10 and step 100 (step >= 10 and step <= 100) - transcoder: -1 # module will be frozen for the entire training run - diff --git a/docs/source/core/core_index.rst b/docs/source/core/core_index.rst new file mode 100644 index 000000000000..28cd149bdcb5 --- /dev/null +++ b/docs/source/core/core_index.rst @@ -0,0 +1,37 @@ +========= +NeMo Core +========= + +You can learn more about the underlying principles of the NeMo codebase in this section. + +The `NeMo Framework codebase `__ is composed of a `core `__ section which contains the main building blocks of the framework, and various `collections `__ which help you +build specialized AI models. + +You can learn more about aspects of the NeMo "core" by following the links below: + +.. toctree:: + :maxdepth: 1 + :name: core + :titlesonly: + + core + neural_modules + exp_manager + neural_types + export + adapters/intro + api + + + +Alternatively, you can jump straight to the documentation for the individual collections: + +* :doc:`Large Language Models (LLMs) <../nlp/nemo_megatron/intro>` + +* :doc:`Automatic Speech Recognition (ASR) <../asr/intro>` + +* :doc:`Multimodal (MM) Models <../multimodal/mllm/intro>` + +* :doc:`Text-to-Speech (TTS) <../tts/intro>` + +* :doc:`Computer Vision (CV) <../vision/intro>` diff --git a/docs/source/core/neural_modules.rst b/docs/source/core/neural_modules.rst new file mode 100644 index 000000000000..fbeec5440d01 --- /dev/null +++ b/docs/source/core/neural_modules.rst @@ -0,0 +1,88 @@ +Neural Modules +============== + +NeMo is built around Neural Modules, conceptual blocks of neural networks that take typed inputs and produce typed outputs. Such +modules typically represent data layers, encoders, decoders, language models, loss functions, or methods of combining activations. +NeMo makes it easy to combine and re-use these building blocks while providing a level of semantic correctness checking via its neural +type system. + +.. note:: *All Neural Modules inherit from ``torch.nn.Module`` and are therefore compatible with the PyTorch ecosystem.* + +There are 3 types on Neural Modules: + + - Regular modules + - Dataset/IterableDataset + - Losses + +Every Neural Module in NeMo must inherit from `nemo.core.classes.module.NeuralModule` class. + +.. autoclass:: nemo.core.classes.module.NeuralModule + +Every Neural Modules inherits the ``nemo.core.classes.common.Typing`` interface and needs to define neural types for its inputs and outputs. +This is done by defining two properties: ``input_types`` and ``output_types``. Each property should return an ordered dictionary of +"port name"->"port neural type" pairs. Here is the example from :class:`~nemo.collections.asr.modules.ConvASREncoder` class: + +.. 
code-block:: python + + @property + def input_types(self): + return OrderedDict( + { + "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), + "length": NeuralType(tuple('B'), LengthsType()), + } + ) + + @property + def output_types(self): + return OrderedDict( + { + "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), + "encoded_lengths": NeuralType(tuple('B'), LengthsType()), + } + ) + + @typecheck() + def forward(self, audio_signal, length=None): + ... + +The code snippet above means that ``nemo.collections.asr.modules.conv_asr.ConvASREncoder`` expects two arguments: + * First one, named ``audio_signal`` of shape ``[batch, dimension, time]`` with elements representing spectrogram values. + * Second one, named ``length`` of shape ``[batch]`` with elements representing lengths of corresponding signals. + +It also means that ``.forward(...)`` and ``__call__(...)`` methods each produce two outputs: + * First one, of shape ``[batch, dimension, time]`` but with elements representing encoded representation (``AcousticEncodedRepresentation`` class). + * Second one, of shape ``[batch]``, corresponding to their lengths. + +.. tip:: It is a good practice to define types and add ``@typecheck()`` decorator to your ``.forward()`` method after your module is ready for use by others. + +.. note:: The outputs of ``.forward(...)`` method will always be of type ``torch.Tensor`` or container of tensors and will work with any other Pytorch code. The type information is attached to every output tensor. If tensors without types is passed to your module, it will not fail, however the types will not be checked. Thus, it is recommended to define input/output types for all your modules, starting with data layers and add ``@typecheck()`` decorator to them. + +.. note:: To temporarily disable typechecking, you can enclose your code in ```with typecheck.disable_checks():``` statement. + + +Dynamic Layer Freezing +---------------------- + +You can selectively freeze any modules inside a Nemo model by specifying a freezing schedule in the config yaml. Freezing stops any gradient updates +to that module, so that its weights are not changed for that step. This can be useful for combatting catastrophic forgetting, for example +when finetuning a large pretrained model on a small dataset. + +The default approach is to freeze a module for the first N training steps, but you can also enable freezing for a specific range of steps, +for example, from step 20 - 100, or even activate freezing from some N until the end of training. You can also freeze a module for the entire training run. +Dynamic freezing is specified in training steps, not epochs. + +To enable freezing, add the following to your config: + +.. code-block:: yaml + + model: + ... 
+ freeze_updates: + enabled: true # set to false if you want to disable freezing + + modules: # list all of the modules you want to have freezing logic for + encoder: 200 # module will be frozen for the first 200 training steps + decoder: [50, -1] # module will be frozen at step 50 and will remain frozen until training ends + joint: [10, 100] # module will be frozen between step 10 and step 100 (step >= 10 and step <= 100) + transcoder: -1 # module will be frozen for the entire training run diff --git a/docs/source/index.rst b/docs/source/index.rst index 9b62174ecbe2..7bf97cb779c3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -40,12 +40,7 @@ For more information, browse the developer docs for your area of interest in the :name: core :titlesonly: - core/core - core/exp_manager - core/neural_types - core/export - core/adapters/intro - core/api + core/core_index .. toctree:: diff --git a/docs/source/nlp/models.rst b/docs/source/nlp/models.rst index ad50d976db9f..2654cfca26d8 100755 --- a/docs/source/nlp/models.rst +++ b/docs/source/nlp/models.rst @@ -1,7 +1,7 @@ .. _nlp_models: -Tasks -===== +NLP Tasks +========= NeMo's NLP collection supports provides the following task-specific models: From bd958aab80f85896e0cb2234426f35c381230cd9 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Fri, 15 Mar 2024 13:34:47 -0700 Subject: [PATCH 030/140] Add option to write nemo checkpoint as loose files instead of compacted .nemo from megatron_ckpt_to_nemo.py (#8641) Signed-off-by: Valerie Sarge Co-authored-by: Eric Harper --- .../nlp/language_modeling/megatron_ckpt_to_nemo.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index c58ae7f156eb..40ba35f819ef 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -78,6 +78,11 @@ def get_args(): help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) parser.add_argument("--nemo_file_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--no_pack_nemo_file", + action="store_true", + help="If passed, output will be written under nemo_file_path as a directory instead of packed as a tarred .nemo file.", + ) parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) parser.add_argument("--pipeline_model_parallel_size", type=int, required=True, default=None) @@ -215,11 +220,17 @@ def convert(local_rank, rank, world_size, args): checkpoint_path, hparams_file=args.hparams_file, trainer=trainer ) model._save_restore_connector = NLPSaveRestoreConnector() + save_file_path = args.nemo_file_path + if args.no_pack_nemo_file: + # With --no_pack_nemo_file, nemo_file_path is expected to be a directory. + # Adding a dummy model filename here conforms with SaveRestoreConnector's convention. 
+ model._save_restore_connector.pack_nemo_file = False + save_file_path = os.path.join(save_file_path, 'model.nemo') if torch.distributed.is_initialized(): torch.distributed.barrier() - model.save_to(args.nemo_file_path) + model.save_to(save_file_path) logging.info(f'NeMo model saved to: {args.nemo_file_path}') From 24f6f9606e8db238669209664bfaa601b590aa46 Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Fri, 15 Mar 2024 15:20:38 -0700 Subject: [PATCH 031/140] update hf space to be canary (#8675) Signed-off-by: Elena Rastorgueva --- docs/source/asr/intro.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 8a244c3ea28d..d8fe1f105caf 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -156,7 +156,7 @@ Canary-1B is a multi-lingual, multi-task model, supporting automatic speech-to-t .. raw:: html -