Merge branch 'NVIDIA:main' into neural_tn

NVIDIA · Jun 28, 2021 · 1577ddb · 1577ddb
2 parents 2ef1085 + 5fcfa9e
commit 1577ddb
Show file tree

Hide file tree

Showing 54 changed files with 353 additions and 296 deletions.
diff --git a/.github/blossom-ci.yml b/.github/blossom-ci.yml
@@ -0,0 +1,105 @@
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A workflow to trigger ci on hybrid infra (github + self hosted runner)
+name: Blossom-CI
+on:
+  issue_comment:
+    types: [created]
+  workflow_dispatch:
+    inputs:
+      platform:
+        description: 'runs-on argument'
+        required: false
+      args:
+        description: 'argument'
+        required: false
+jobs:
+  Authorization:
+    name: Authorization
+    runs-on: blossom
+    outputs:
+      args: ${{ env.args }}
+
+    # Runs only when an allow-listed actor comments exactly '/blossom-ci' (exact match via JSON array)
+    if: |
+      contains(fromJSON('["ravitestgit"]'), github.actor) &&
+      github.event.comment.body == '/blossom-ci'
+    steps:
+      - name: Check if comment is issued by authorized person
+        run: blossom-ci
+        env:
+          OPERATION: 'AUTH'
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+
+  Vulnerability-scan:
+    name: Vulnerability scan
+    needs: [Authorization]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
+          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
+          lfs: 'true'
+
+      # repo specific steps
+      #- name: Setup java
+      #  uses: actions/setup-java@v1
+      #  with:
+      #    java-version: 1.8
+
+      # add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file
+      #- name: Setup blackduck properties
+      #  run: |
+      #       PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g')
+      #       echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties
+      #       echo detect.maven.included.scopes=compile >> application.properties
+
+      - name: Run blossom action
+        uses: NVIDIA/blossom-action@main
+        env:
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+        with:
+          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
+          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
+          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
+
+  Job-trigger:
+    name: Start ci job
+    needs: [Vulnerability-scan]
+    runs-on: blossom
+    steps:
+      - name: Start ci job
+        run: blossom-ci
+        env:
+          OPERATION: 'START-CI-JOB'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  Post-processing:
+    name: Post processing
+    runs-on: blossom
+    if: github.event_name == 'workflow_dispatch'
+    steps:
+      - name: Start post processing
+        run: blossom-ci
+        env:
+          OPERATION: 'POST-PROCESSING'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -6,7 +6,7 @@ pipeline {
         }
   }
   options {
-    timeout(time: 1, unit: 'HOURS')
+    timeout(time: 2, unit: 'HOURS')
     disableConcurrentBuilds()
   }
   stages {

diff --git a/examples/asr/asr_webapp/Dockerfile b/examples/asr/asr_webapp/Dockerfile
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG BASE_IMAGE=nvcr.io/nvidia/nemo:1.0.0rc1
+ARG BASE_IMAGE=nvcr.io/nvidia/nemo:1.0.1
 
 # build an image that includes only the nemo dependencies, ensures that dependencies
 # are included first for optimal caching, and useful for building a development

diff --git a/examples/nlp/machine_translation/enc_dec_nmt.py b/examples/nlp/machine_translation/enc_dec_nmt.py
@@ -111,7 +111,7 @@ def main(cfg: MTEncDecConfig) -> None:
     # training is managed by PyTorch Lightning
     trainer_cfg = OmegaConf.to_container(cfg.trainer)
     trainer_cfg.pop('plugins', None)
-    trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg)
+    trainer = Trainer(plugins=[NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)], **trainer_cfg)
 
     # tokenizers will be trained and and tarred training data will be created if needed
     # model config is then updated

diff --git a/examples/speaker_recognition/extract_speaker_embeddings.py b/examples/speaker_recognition/extract_speaker_embeddings.py
@@ -89,7 +89,7 @@ def main():
             labels=None,
             batch_size=1,
             shuffle=False,
-            time_length=8,
+            time_length=20,
             embedding_dir=args.embedding_dir,
         )
     )

diff --git a/examples/speaker_recognition/voxceleb_eval.py b/examples/speaker_recognition/voxceleb_eval.py
@@ -70,7 +70,7 @@ def get_acc(trial_file='', emb='', save_kaldi_emb=False):
                 keys.append(y_speaker)
                 trial_embs.extend([Y])
 
-            score = (X @ Y.T) / (((X @ X.T) * (Y @ Y.T)) ** 0.5)
+            score = np.dot(X, Y) / ((np.dot(X, X) * np.dot(Y, Y)) ** 0.5)
             score = (score + 1) / 2
 
             all_scores.append(score)

diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py
@@ -127,6 +127,13 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]:
         )
         results.append(model)
 
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_zh_citrinet_1024_gamma_0_25",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_zh_citrinet_1024_gamma_0_25.nemo",
+        )
+        results.append(model)
+
         model = PretrainedModelInfo(
             pretrained_model_name="asr_talknet_aligner",
             description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:asr_talknet_aligner",

diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py
@@ -29,6 +29,7 @@
 from nemo.collections.asr.models.asr_model import ExportableEncDecModel
 from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
 from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations
+from nemo.collections.asr.parts.utils.speaker_utils import embedding_normalize
 from nemo.collections.common.losses import CrossEntropyLoss as CELoss
 from nemo.collections.common.metrics import TopKClassificationAccuracy
 from nemo.core.classes import ModelPT
@@ -381,6 +382,7 @@ def test_epoch_end(self, outputs):
         slices = torch.cat([x['slices'] for x in outputs])
         emb_shape = embs.shape[-1]
         embs = embs.view(-1, emb_shape).cpu().numpy()
+        embs = embedding_normalize(embs)
         out_embeddings = {}
         start_idx = 0
         with open(self.test_manifest, 'r') as manifest:

diff --git a/nemo/collections/asr/modules/audio_preprocessing.py b/nemo/collections/asr/modules/audio_preprocessing.py
@@ -434,7 +434,7 @@ def input_types(self):
         """
         return {
             "input_spec": NeuralType(('B', 'D', 'T'), SpectrogramType()),
-            "length": NeuralType(tuple('B'), LengthsType(), optional=True),
+            "length": NeuralType(tuple('B'), LengthsType()),
         }
 
     @property
@@ -474,7 +474,7 @@ def __init__(
                 mask_value=mask_value,
             )
         else:
-            self.spec_augment = lambda input_spec: input_spec
+            self.spec_augment = lambda input_spec, length: input_spec
 
         # Check if numba is supported, and use a Numba kernel if it is
         if use_numba_spec_augment and numba_utils.numba_cuda_is_supported(__NUMBA_MINIMUM_VERSION__):
@@ -490,15 +490,15 @@ def __init__(
             self.spec_augment_numba = None
 
     @typecheck()
-    def forward(self, input_spec, length=None):
+    def forward(self, input_spec, length):
         augmented_spec = self.spec_cutout(input_spec=input_spec)
 
         # To run the Numba kernel, correct numba version is required as well as
         # tensor must be on GPU and length must be provided
         if self.spec_augment_numba is not None and spec_augment_launch_heuristics(augmented_spec, length):
             augmented_spec = self.spec_augment_numba(input_spec=augmented_spec, length=length)
         else:
-            augmented_spec = self.spec_augment(input_spec=augmented_spec)
+            augmented_spec = self.spec_augment(input_spec=augmented_spec, length=length)
         return augmented_spec
 
 

diff --git a/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py b/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py
@@ -251,11 +251,6 @@ def forward(self, input_spec, length):
         sh = input_spec.shape
         bs = sh[0]
 
-        if self.adaptive_temporal_width:
-            time_width = max(1, int(sh[2] * self.time_width))
-        else:
-            time_width = self.time_width
-
         # Construct the freq and time masks as well as start positions
         if self.freq_masks > 0:
             freq_starts = torch.randint(
@@ -267,10 +262,30 @@ def forward(self, input_spec, length):
             freq_lengths = torch.zeros([bs, 1], dtype=torch.int64, device=input_spec.device)
 
         if self.time_masks > 0:
-            time_starts = torch.randint(
-                0, sh[2] - time_width + 1, size=[bs, self.time_masks], device=input_spec.device
-            )
-            time_lengths = torch.randint(0, time_width + 1, size=[bs, self.time_masks], device=input_spec.device)
+            # Per-sample maximum mask width: a fraction of each utterance's valid
+            # length when adaptive, otherwise the fixed configured width.
+            if self.adaptive_temporal_width:
+                time_width = (length * self.time_width).int().clamp(min=1)
+            else:
+                time_width = (
+                    torch.tensor(self.time_width, dtype=torch.int32, device=input_spec.device)
+                    .unsqueeze(0)
+                    .repeat(sh[0])
+                )
+            # Draw mask starts/widths per sample, bounded by that sample's `length[idx]`.
+            time_starts, time_lengths = [], []
+            for idx in range(sh[0]):
+                start_high = max(1, length[idx] - time_width[idx])
+                time_starts.append(
+                    torch.randint(0, start_high, size=[1, self.time_masks], device=input_spec.device)
+                )
+                time_lengths.append(
+                    torch.randint(0, time_width[idx] + 1, size=[1, self.time_masks], device=input_spec.device)
+                )
+            # Fix: starts must be concatenated from `time_starts`, not `time_lengths`.
+            time_starts = torch.cat(time_starts, 0)
+            time_lengths = torch.cat(time_lengths, 0)
+
         else:
             time_starts = torch.zeros([bs, 1], dtype=torch.int64, device=input_spec.device)
             time_lengths = torch.zeros([bs, 1], dtype=torch.int64, device=input_spec.device)

diff --git a/nemo/collections/asr/parts/submodules/spectr_augment.py b/nemo/collections/asr/parts/submodules/spectr_augment.py
@@ -18,7 +18,7 @@
 import torch.nn as nn
 
 from nemo.core.classes import Typing, typecheck
-from nemo.core.neural_types import NeuralType, SpectrogramType
+from nemo.core.neural_types import LengthsType, NeuralType, SpectrogramType
 
 
 class SpecAugment(nn.Module, Typing):
@@ -43,7 +43,10 @@ class SpecAugment(nn.Module, Typing):
     def input_types(self):
         """Returns definitions of module input types
         """
-        return {"input_spec": NeuralType(('B', 'D', 'T'), SpectrogramType())}
+        return {
+            "input_spec": NeuralType(('B', 'D', 'T'), SpectrogramType()),
+            "length": NeuralType(tuple('B'), LengthsType()),
+        }
 
     @property
     def output_types(self):
@@ -54,7 +57,7 @@ def output_types(self):
     def __init__(
         self, freq_masks=0, time_masks=0, freq_width=10, time_width=10, rng=None, mask_value=0.0,
     ):
-        super(SpecAugment, self).__init__()
+        super().__init__()
 
         self._rng = random.Random() if rng is None else rng
 
@@ -76,14 +79,9 @@ def __init__(
 
     @typecheck()
     @torch.no_grad()
-    def forward(self, input_spec):
+    def forward(self, input_spec, length):
         sh = input_spec.shape
 
-        if self.adaptive_temporal_width:
-            time_width = max(1, int(sh[2] * self.time_width))
-        else:
-            time_width = self.time_width
-
         for idx in range(sh[0]):
             for i in range(self.freq_masks):
                 x_left = self._rng.randint(0, sh[1] - self.freq_width)
@@ -93,7 +91,12 @@ def forward(self, input_spec):
                 input_spec[idx, x_left : x_left + w, :] = self.mask_value
 
             for i in range(self.time_masks):
-                y_left = self._rng.randint(0, sh[2] - time_width)
+                if self.adaptive_temporal_width:
+                    time_width = max(1, int(length[idx] * self.time_width))
+                else:
+                    time_width = self.time_width
+
+                y_left = self._rng.randint(0, max(1, length[idx] - time_width))
 
                 w = self._rng.randint(0, time_width)
 

diff --git a/nemo/collections/asr/parts/utils/speaker_utils.py b/nemo/collections/asr/parts/utils/speaker_utils.py
@@ -398,3 +398,20 @@ def write_rttm2manifest(paths2audio_files, paths2rttm_files, manifest_file):
             outfile.write("\n")
             f.close()
     return manifest_file
+
+
+def embedding_normalize(embs, use_std=False, eps=1e-10):
+    """
+    Mean-center and L2 length-normalize the input speaker embeddings.
+    input:
+        embs: numpy array of embeddings of shape (Batch,emb_size)
+        use_std: if True, also divide each dimension by its std over the batch
+        eps: floor added to the std and applied to the L2 norm to avoid 0-division
+    output:
+        embs: normalized embeddings of shape (Batch,emb_size)
+    """
+    embs = embs - embs.mean(axis=0)
+    if use_std:
+        embs = embs / (embs.std(axis=0) + eps)
+    # Floor the norm so all-zero rows (e.g. a single-row batch) give 0, not NaN.
+    return embs / np.maximum(np.linalg.norm(embs, ord=2, axis=-1, keepdims=True), eps)
diff --git a/nemo/collections/common/tokenizers/bytelevel_tokenizers.py b/nemo/collections/common/tokenizers/bytelevel_tokenizers.py
@@ -15,6 +15,7 @@
 import re
 from pathlib import Path
 from typing import List
+
 from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 
 __all__ = ['ByteLevelProcessor', 'ByteLevelTokenizer']

diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py
@@ -184,7 +184,7 @@ def distributed_sampler_kwargs(self):
             return distributed_sampler_kwargs
 
         else:
-            return super().distributed_sampler_kwargs
+            return super(NLPDDPPlugin, self).distributed_sampler_kwargs
 
 
 class NLPCheckpointConnector(CheckpointConnector):

diff --git a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py
@@ -107,8 +107,9 @@
 parser.add_argument(
     '--max_duration',
     default=None,
+    required=True,
     type=float,
-    help='Maximum duration of audio clip in the dataset. By default, it is None and will not filter files.',
+    help='Maximum duration of audio clip in the dataset. By default, it is None and is required to be set.',
 )
 parser.add_argument(
     '--min_duration',