Megatron perceiver with tensor parallelism only #4318

Merged
162 commits merged on Jul 6, 2022

Commits (162)
99037b5
Temp
MaximumEntropy Mar 28, 2022
f37fe52
Merge
MaximumEntropy Mar 28, 2022
12e6574
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Mar 29, 2022
5d2cdbb
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Mar 29, 2022
db6a25b
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Mar 31, 2022
15513e3
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 3, 2022
06baa1c
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 4, 2022
3f6ed7e
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 6, 2022
c0b029e
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 6, 2022
423b4ce
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 14, 2022
afdbcdc
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 15, 2022
3153ab5
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 18, 2022
a6fe975
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 19, 2022
2daf757
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Apr 22, 2022
13e79f8
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 3, 2022
9b6e43a
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 3, 2022
cdedcf6
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 6, 2022
016cd1f
Add megatron dataset
MaximumEntropy May 8, 2022
d1639d4
Update config and fix global batch fetcher
MaximumEntropy May 8, 2022
3b7c91a
Add dataset class
MaximumEntropy May 9, 2022
2e22cfc
Update comments
MaximumEntropy May 9, 2022
590f40e
Style
MaximumEntropy May 9, 2022
17bd54e
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 9, 2022
340feb5
Merge branch 'nmt_memmap_dataloader' of github.com:NVIDIA/NeMo into n…
MaximumEntropy May 9, 2022
968938b
Update yaml
MaximumEntropy May 9, 2022
8b8134d
Fix duplicate yaml key
MaximumEntropy May 9, 2022
7a7cb89
Translate method and preprocess script for raw text
MaximumEntropy May 11, 2022
23836dc
Merge branch 'main' of github.com:NVIDIA/NeMo into nmt_memmap_dataloader
MaximumEntropy May 11, 2022
0c4657a
Merge branch 'main' into nmt_memmap_dataloader
michalivne May 11, 2022
a144f10
Style
MaximumEntropy May 11, 2022
6443799
Merge branch 'nmt_memmap_dataloader' of github.com:NVIDIA/NeMo into n…
MaximumEntropy May 11, 2022
8b59cca
Remove pdb
MaximumEntropy May 11, 2022
d3917f3
Fix arg name
MaximumEntropy May 11, 2022
a5a6ee5
Fix other arg
MaximumEntropy May 11, 2022
b2c3e9c
Change sampler back
MaximumEntropy May 11, 2022
f46f0d1
Move back to global batch fetcher to use distributed sampler
MaximumEntropy May 11, 2022
f225b4b
Add text memmap data
MaximumEntropy May 11, 2022
373e1f5
Update monitor
MaximumEntropy May 11, 2022
f04b4b5
Fixes for PP
MaximumEntropy May 12, 2022
201e1be
Remove unused import
MaximumEntropy May 12, 2022
79f676d
Truncate examples in text memmap
MaximumEntropy May 12, 2022
7fc54b8
NMT training batch interpolation key
MaximumEntropy May 12, 2022
a0a446b
tarred data fix
MaximumEntropy May 12, 2022
2baef4b
Change dataset type check
MaximumEntropy May 12, 2022
fc7eb49
Fix sampler
MaximumEntropy May 12, 2022
e67b465
Pass dataset cfg to determine type
MaximumEntropy May 13, 2022
bbdce07
Log global step on validation step as well
MaximumEntropy May 13, 2022
7f38112
Fix NMT model saving with artifacts
MaximumEntropy May 13, 2022
3c328a9
Initialize DDP in decode if not initialized. Needed for inference onl…
MaximumEntropy May 14, 2022
708868c
Megatron NMT inference script
MaximumEntropy May 14, 2022
bb8467d
Inference config file
MaximumEntropy May 14, 2022
f33a546
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 14, 2022
ef0e632
hardcode max delta temporarily
MaximumEntropy May 14, 2022
16a1a10
Merge branch 'nmt_memmap_dataloader' of github.com:NVIDIA/NeMo into n…
MaximumEntropy May 14, 2022
5fde572
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 14, 2022
0230165
Style
MaximumEntropy May 17, 2022
e2ddd34
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 17, 2022
19d5e92
detokenizer if processor is not none
MaximumEntropy May 18, 2022
5f8de2a
Sampler config
MaximumEntropy May 18, 2022
59406ab
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 18, 2022
16386c7
Compat with configs without sampler arg
MaximumEntropy May 18, 2022
bd2aa5d
Merge branch 'nmt_memmap_dataloader' of github.com:NVIDIA/NeMo into n…
MaximumEntropy May 18, 2022
7b486b0
Style
MaximumEntropy May 18, 2022
4a84a1f
Comment for validation dataset type
MaximumEntropy May 18, 2022
b382932
Fix tokenizer building
MaximumEntropy May 18, 2022
7faa950
CI test for megatron nmt
MaximumEntropy May 18, 2022
57bb39c
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 18, 2022
20a1b5d
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 18, 2022
f64c7f2
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 20, 2022
7e6c4ea
Fix tokenizer in restore
MaximumEntropy May 20, 2022
0727bdf
Style
MaximumEntropy May 20, 2022
b2fadff
Merge branch 'nmt_memmap_dataloader' of github.com:NVIDIA/NeMo into n…
MaximumEntropy May 20, 2022
1c6a72c
O2 restore from fix
MaximumEntropy May 20, 2022
1b4d414
Remove print
MaximumEntropy May 21, 2022
c30e0d8
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 21, 2022
5fa2553
Change tokenizer model name in config
MaximumEntropy May 21, 2022
405b65b
Merge branch 'nmt_memmap_dataloader' of github.com:NVIDIA/NeMo into n…
MaximumEntropy May 21, 2022
153b345
Logging
MaximumEntropy May 21, 2022
c924138
Set seed for distributed sampler
MaximumEntropy May 23, 2022
d1642b3
Cluster debugging messages
MaximumEntropy May 23, 2022
51c644d
Style
MaximumEntropy May 23, 2022
134cdc0
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 23, 2022
74705da
Fix max generation delta
MaximumEntropy May 23, 2022
d725791
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 23, 2022
b751372
No LM Init
MaximumEntropy May 24, 2022
6607611
Merge branch 'main' into fix_no_lm_init
MaximumEntropy May 24, 2022
ee87d78
Merge branch 'fix_no_lm_init' of github.com:NVIDIA/NeMo into nmt_memm…
MaximumEntropy May 24, 2022
244ebde
Use nlp save restore connector
MaximumEntropy May 24, 2022
37b2397
Remove useless infer args
MaximumEntropy May 25, 2022
b047c77
Typo
MaximumEntropy May 25, 2022
80639da
UTF8 safe print of translation result
MaximumEntropy May 25, 2022
5caca92
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 25, 2022
be27a7f
Style
MaximumEntropy May 25, 2022
aff7d14
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 25, 2022
9fa3c4e
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 26, 2022
3efd318
Add save restore connector back with comment
MaximumEntropy May 26, 2022
605a7c7
Refactor
MaximumEntropy May 26, 2022
1c721b8
Fix CI test
MaximumEntropy May 31, 2022
031c127
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 31, 2022
04c91ba
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 31, 2022
120ae1e
Add missing args
MaximumEntropy May 31, 2022
0169239
Merge branch 'main' into nmt_memmap_dataloader
aklife97 May 31, 2022
fd0b976
Merge branch 'nmt_memmap_dataloader' of github.com:NVIDIA/NeMo into n…
MaximumEntropy May 31, 2022
1fe838b
Address comments
MaximumEntropy May 31, 2022
09884f3
Empty to restart
MaximumEntropy May 31, 2022
4323dc0
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy May 31, 2022
ff3df3d
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy May 31, 2022
d1b71b5
Fix CI test
MaximumEntropy May 31, 2022
a6c9dda
Merge branch 'nmt_memmap_dataloader' of github.com:NVIDIA/NeMo into n…
MaximumEntropy May 31, 2022
19630e2
Merge branch 'main' of github.com:NVIDIA/NeMo into nmt_memmap_dataloader
MaximumEntropy Jun 1, 2022
2d3ba1a
Check for test ds
MaximumEntropy Jun 1, 2022
4214909
Fix merge conflicts
MaximumEntropy Jun 1, 2022
d9b4102
set fusion to false
MaximumEntropy Jun 1, 2022
37cda3b
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy Jun 1, 2022
2423d07
Merge branch 'main' of github.com:NVIDIA/NeMo into main
MaximumEntropy Jun 1, 2022
86137df
Merge branch 'main' into nmt_memmap_dataloader
MaximumEntropy Jun 1, 2022
cccf09c
Initial perceiver encoder
MaximumEntropy Jun 1, 2022
7d770e9
Fix conflicts and some perceiver fixes
MaximumEntropy Jun 1, 2022
a8286a6
Perceiver with PP=1
MaximumEntropy Jun 2, 2022
a440c78
Remove init cross attn
MaximumEntropy Jun 2, 2022
ae12cae
Fix conflicts and merge main
MaximumEntropy Jun 2, 2022
3e3629d
CI test and remove init cross attn arg
MaximumEntropy Jun 2, 2022
632c43d
Remove init cross attn layers from file
MaximumEntropy Jun 2, 2022
5cf1582
Style
MaximumEntropy Jun 2, 2022
e811827
Clean up
MaximumEntropy Jun 2, 2022
e7150b3
Merge branch 'main' into megatron_perceiver
michalivne Jun 3, 2022
89cb8d8
Merge branch 'main' into megatron_perceiver
MaximumEntropy Jun 7, 2022
0ab2227
update branch
ericharper Jun 13, 2022
15d3101
Set headscale false (#4364)
MaximumEntropy Jun 13, 2022
2366748
Add wandb as dependency (#4365)
titu1994 Jun 13, 2022
c26100d
Raise trainer error (#4356)
MaximumEntropy Jun 13, 2022
7246e6b
Set headscale false (#4364) (#4366)
titu1994 Jun 14, 2022
372f9f7
Finetuning changes for BART (#4003)
MaximumEntropy Jun 14, 2022
98f8988
Make position embedding expansion specific to a batch to avoid checkp…
MaximumEntropy Jun 14, 2022
4dbda18
Refactor bias act fusion
MaximumEntropy Jun 15, 2022
fec4763
Update NMT config
MaximumEntropy Jun 15, 2022
31099f4
Fix electronic bug, new time ITN rule (#4355)
ekmb Jun 15, 2022
f94e587
Update ci tests
MaximumEntropy Jun 15, 2022
bb87904
Merge branch 'r1.10.0' into bias_act_fusion_refactor
MaximumEntropy Jun 15, 2022
f67625c
Correct support for dataclasses in default module dim (#4372)
titu1994 Jun 15, 2022
8187bc9
fix pad id bug (#4377)
yidong72 Jun 16, 2022
a9c0cab
Question answering bug fix (#4381)
Zhilin123 Jun 16, 2022
e6c5347
Merge branch 'r1.10.0' into bias_act_fusion_refactor
MaximumEntropy Jun 16, 2022
76ae9b5
Fix ASR Typos in tutorials (#4384)
titu1994 Jun 17, 2022
f304ca6
Merge branch 'r1.10.0' into bias_act_fusion_refactor
MaximumEntropy Jun 17, 2022
b2fba34
Add Docs for NeMo Adapters (#4369)
titu1994 Jun 17, 2022
d8c5fac
Update NeMo docs (#4397)
titu1994 Jun 17, 2022
faaf02f
Punctuation and capitalization tests race condition (#4399)
PeganovAnton Jun 18, 2022
bf433c0
Merge branch 'r1.10.0' into bias_act_fusion_refactor
MaximumEntropy Jun 21, 2022
b25bfb5
Merge bias act fusion refactor
MaximumEntropy Jun 21, 2022
dd836b5
bias act fusion changes
MaximumEntropy Jun 21, 2022
6c42156
Merge branch 'main' of github.com:NVIDIA/NeMo into megatron_perceiver
MaximumEntropy Jun 26, 2022
7608f3b
Address comments
MaximumEntropy Jun 26, 2022
b9e3ca5
Merge branch 'main' of github.com:NVIDIA/NeMo into megatron_perceiver
MaximumEntropy Jun 29, 2022
3c38900
Fix geglu without fusion
MaximumEntropy Jul 2, 2022
ba8ff05
Merge and fix
MaximumEntropy Jul 3, 2022
64ddf1c
Merge branch 'geglu_no_fusion_fix' of github.com:NVIDIA/NeMo into meg…
MaximumEntropy Jul 3, 2022
35db236
Reset files to main
MaximumEntropy Jul 5, 2022
6ff9618
Remove hidden blocks
MaximumEntropy Jul 5, 2022
ffa9914
Fix style
MaximumEntropy Jul 6, 2022
0a28392
Merge branch 'main' of github.com:NVIDIA/NeMo into megatron_perceiver
MaximumEntropy Jul 6, 2022
216a034
Merge branch 'main' into megatron_perceiver
MaximumEntropy Jul 6, 2022
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_bart_config.yaml
@@ -56,7 +56,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -76,11 +76,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
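The same three perceiver-related keys (encoder_arch, hidden_steps, num_self_attention_per_cross_attention) are added to each encoder-decoder config in this PR. The following minimal sketch is an illustration rather than part of the PR: it uses only keys visible in the diffs above, and the layer-count arithmetic follows the comment on num_layers. OmegaConf is used simply because NeMo configs are OmegaConf/Hydra based.

from omegaconf import OmegaConf

# Perceiver-related encoder settings, mirroring the keys added in the config diffs.
encoder_cfg = OmegaConf.create(
    {
        "encoder_arch": "perceiver",                  # new option introduced by this PR
        "num_layers": 12,                             # number of cross-attention blocks for perceiver encoders
        "num_self_attention_per_cross_attention": 1,  # self-attention layers per cross-attention layer
        "hidden_steps": 32,                           # number of latent vectors
    }
)

# Each "layer" is one cross-attention plus num_self_attention_per_cross_attention self-attention layers.
total_attention_layers = encoder_cfg.num_layers * (1 + encoder_cfg.num_self_attention_per_cross_attention)
print(total_attention_layers)  # 24 with the values above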
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -57,7 +57,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -78,11 +78,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_ul2_config.yaml
@@ -55,7 +55,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -75,11 +75,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
@@ -66,7 +66,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -91,6 +91,8 @@ model:
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
@@ -120,6 +120,8 @@ def setup_optimizer_param_groups(self):

def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
# TODO: create get_encoder_decoder_model() here for different losses (e.g., nll, vae, mim)
if parallel_state.get_pipeline_model_parallel_world_size() > 1 and self.cfg.encoder_arch == 'perceiver':
raise ValueError("Perceivers with pipeline parallel > 1 are not supported yet.")
if hasattr(self.cfg, 'bias_gelu_fusion'):
logging.warning('bias_gelu_fusion is deprecated. Please use bias_activation_fusion instead.')
activation_fusion = self.cfg.bias_gelu_fusion
@@ -163,6 +165,8 @@ def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
normalization=self.cfg.get('normalization', 'layernorm'),
transformer_block_type=self.cfg.get('transformer_block_type', 'pre_ln'),
headscale=self.cfg.get('headscale', False),
hidden_steps=self.cfg.get('hidden_steps', -1),
num_self_attention_per_cross_attention=self.cfg.get('num_self_attention_per_cross_attention', 1),
add_encoder=add_encoder,
add_decoder=add_decoder,
)
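As the new guard above (and the PR title) indicates, the perceiver encoder currently supports tensor parallelism only. Below is a hedged sketch of a pre-flight check a user might run on a config dict before launching training; the key names tensor_model_parallel_size and pipeline_model_parallel_size are the usual Megatron-style names and are assumptions here, not taken from this diff.

def check_perceiver_parallelism(cfg: dict) -> None:
    """Raise early if a perceiver encoder is combined with pipeline parallelism."""
    if cfg.get("encoder_arch", "transformer") == "perceiver":
        if cfg.get("pipeline_model_parallel_size", 1) > 1:
            raise ValueError("Perceivers with pipeline parallelism > 1 are not supported yet.")
        # Tensor parallelism (tensor_model_parallel_size >= 1) is fine for perceiver encoders.


check_perceiver_parallelism({"encoder_arch": "perceiver", "pipeline_model_parallel_size": 1})  # passes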
@@ -76,7 +76,6 @@ def get_decoder_model(
headscale=False,
transformer_block_type="pre_ln",
hidden_steps=-1,
hidden_blocks=1,
parent_model_type=ModelType.encoder_or_decoder,
layer_type=None,
chunk_size=64,
@@ -13,7 +13,9 @@
# limitations under the License.

"""Transformer based language model."""
import torch

from nemo.collections.nlp.modules.common.megatron.megatron_perceiver_encoders import MegatronPerceiverEncoderModule
from nemo.collections.nlp.modules.common.megatron.module import MegatronModule
from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults

@@ -41,15 +43,25 @@ def __init__(
# AttnMaskType enum mask type (e.g., padding, casual)
encoder_attn_mask_type: AttnMaskType = None,
decoder_attn_mask_type: AttnMaskType = None,
hidden_steps: int = None,
):
super(MegatronTransformerEncoderDecoderModule, self).__init__()

self.encoder = encoder
self.decoder = decoder
self.hidden_steps = hidden_steps
if isinstance(encoder, MegatronPerceiverEncoderModule) and hidden_steps is None:
raise ValueError(
f"hidden_steps cannot be None for perceiver encoders. It is needed to compute the encoder-decoder cross attention mask."
)

# try to infer mask_type if not given
if encoder_attn_mask_type is None:
if encoder is None:
encoder_attn_mask_type = None
# Perceiver does not have a `.model` attribute, assume it always uses padding mask.
elif isinstance(encoder, MegatronPerceiverEncoderModule):
encoder_attn_mask_type = AttnMaskType.padding
elif hasattr(encoder.model, 'self_attn_mask_type'):
encoder_attn_mask_type = encoder.model.self_attn_mask_type
else:
@@ -136,6 +148,10 @@ def forward(
return enc_output

# decoder
# Adjust encoder attention mask if encoder is a perceiver.
if self.encoder is not None and isinstance(self.encoder, MegatronPerceiverEncoderModule):
enc_attn_mask = torch.ones(enc_output.size(0), self.hidden_steps).to(enc_output.device)

dec_output = self.decode(
dec_input=dec_input,
dec_attn_mask=dec_attn_mask,
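The forward() change above is needed because a perceiver encoder maps a variable-length source sequence onto a fixed number of latent vectors (hidden_steps), so the original token-level padding mask no longer matches the encoder output. A small standalone sketch of that mask replacement follows; the [batch, steps, hidden] layout and the hidden size of 768 are illustrative assumptions (Megatron internally may use a [seq, batch, hidden] layout).

import torch

batch_size, src_len, hidden_steps, hidden_size = 4, 50, 32, 768

# Token-level padding mask over the source sequence (1 = keep, 0 = pad).
enc_attn_mask = torch.randint(0, 2, (batch_size, src_len))

# Perceiver output: a fixed number of latent vectors, independent of src_len.
enc_output = torch.randn(batch_size, hidden_steps, hidden_size)

# Mirror the change in forward(): the decoder may attend to every latent,
# so the cross-attention mask becomes all ones of shape [batch, hidden_steps].
enc_attn_mask = torch.ones(enc_output.size(0), hidden_steps, device=enc_output.device)
print(enc_attn_mask.shape)  # torch.Size([4, 32])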
45 changes: 43 additions & 2 deletions nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
@@ -13,6 +13,7 @@
# limitations under the License.

"""Transformer based language model."""
from nemo.collections.nlp.modules.common.megatron.megatron_perceiver_encoders import MegatronPerceiverEncoderModule
from nemo.collections.nlp.modules.common.megatron.megatron_transformer_encoder import MegatronTransformerEncoderModule
from nemo.collections.nlp.modules.common.megatron.retrieval_transformer import (
MegatronRetrievalTransformerEncoderModule,
@@ -35,7 +36,7 @@

__all__ = []

AVAILABLE_ENCODERS = ["transformer"]
AVAILABLE_ENCODERS = ["transformer", "perceiver", "retro"]


def get_encoder_model(
@@ -74,11 +75,12 @@ def get_encoder_model(
normalization="layernorm",
headscale=False,
transformer_block_type="pre_ln",
hidden_steps=-1,
hidden_steps=32,
hidden_blocks=1,
parent_model_type=ModelType.encoder_or_decoder,
layer_type=None,
chunk_size=64,
num_self_attention_per_cross_attention=1,
layer_number_offset=0, # this is use only for attention norm_factor scaling
):
"""Build language model and return along with the key to save."""
@@ -168,6 +170,45 @@ def get_encoder_model(
chunk_size=chunk_size,
layer_number_offset=layer_number_offset,
)
elif arch == "perceiver":
encoder = MegatronPerceiverEncoderModule(
init_method=init_method,
output_layer_init_method=scaled_init_method,
hidden_size=hidden_size,
num_layers=num_layers,
num_attention_heads=num_attention_heads,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
kv_channels=kv_channels,
ffn_hidden_size=ffn_hidden_size,
encoder_attn_mask_type=encoder_attn_mask_type,
pre_process=pre_process,
post_process=post_process,
use_cpu_initialization=use_cpu_initialization,
hidden_dropout=hidden_dropout,
attention_dropout=attention_dropout,
position_embedding_type=position_embedding_type,
relative_attention_num_buckets=relative_attention_num_buckets,
relative_attention_max_distance=relative_attention_max_distance,
precision=precision,
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
layernorm_epsilon=layernorm_epsilon,
bias_activation_fusion=bias_activation_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,
masked_softmax_fusion=masked_softmax_fusion,
persist_layer_norm=persist_layer_norm,
openai_gelu=openai_gelu,
onnx_safe=onnx_safe,
activation=activation,
bias=bias,
normalization=normalization,
transformer_block_type=transformer_block_type,
headscale=headscale,
parent_model_type=parent_model_type,
hidden_steps=hidden_steps,
num_self_attention_per_cross_attention=num_self_attention_per_cross_attention,
)
else:
raise ValueError(f"Unknown encoder arch = {arch}. Available encoder arch = {AVAILABLE_ENCODERS}")

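get_encoder_model() now dispatches on arch, with "perceiver" accepted alongside "transformer" and "retro", and falls through to the ValueError above for anything unknown. A stripped-down sketch of that dispatch pattern (the real function forwards the many keyword arguments shown above, which are omitted here):

AVAILABLE_ENCODERS = ["transformer", "perceiver", "retro"]


def select_encoder_arch(arch: str) -> str:
    """Validate the requested encoder architecture, mirroring the check in get_encoder_model()."""
    if arch not in AVAILABLE_ENCODERS:
        raise ValueError(f"Unknown encoder arch = {arch}. Available encoder arch = {AVAILABLE_ENCODERS}")
    return arch


select_encoder_arch("perceiver")  # accepted after this PR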