Fix docs #165

Merged 9 commits on Jun 29, 2018
8 changes: 8 additions & 0 deletions docs/sources/source/api-docs/decoders.rst
@@ -37,3 +37,11 @@ transformer\_decoders
:members:
:undoc-members:
:show-inheritance:

convs2s\_decoder
-------------------------------------

.. automodule:: decoders.convs2s_decoder
:members:
:undoc-members:
:show-inheritance:
25 changes: 25 additions & 0 deletions docs/sources/source/api-docs/encoders.rst
@@ -22,6 +22,14 @@ ds2\_encoder
:undoc-members:
:show-inheritance:

w2l\_encoder
----------------------------

.. automodule:: encoders.w2l_encoder
:members:
:undoc-members:
:show-inheritance:

rnn\_encoders
-----------------------------

@@ -38,6 +46,14 @@ transformer\_encoders
:undoc-members:
:show-inheritance:

convs2s\_encoder
-------------------------------------

.. automodule:: encoders.convs2s_encoder
:members:
:undoc-members:
:show-inheritance:

resnet\_encoder
----------------------------------

@@ -53,3 +69,12 @@ resnet\_blocks
:members:
:undoc-members:
:show-inheritance:


cnn\_encoder
--------------------------------

.. automodule:: encoders.cnn_encoder
:members:
:undoc-members:
:show-inheritance:
15 changes: 15 additions & 0 deletions docs/sources/source/api-docs/parts.cnns.rst
@@ -0,0 +1,15 @@
cnns
=======================================

.. automodule:: parts.cnns
:members:
:undoc-members:
:show-inheritance:

conv\_blocks
-------------------------------------------------------

.. automodule:: parts.cnns.conv_blocks
:members:
:undoc-members:
:show-inheritance:
31 changes: 31 additions & 0 deletions docs/sources/source/api-docs/parts.convs2s.rst
@@ -0,0 +1,31 @@
convs2s
=======================================

.. automodule:: parts.convs2s
:members:
:undoc-members:
:show-inheritance:

attention\_wn\_layer
-------------------------------------------------------

.. automodule:: parts.convs2s.attention_wn_layer
:members:
:undoc-members:
:show-inheritance:

conv\_wn\_layer
-------------------------------------------------------

.. automodule:: parts.convs2s.conv_wn_layer
:members:
:undoc-members:
:show-inheritance:

ffn\_wn\_layer
-------------------------------------------------------

.. automodule:: parts.convs2s.ffn_wn_layer
:members:
:undoc-members:
:show-inheritance:
2 changes: 2 additions & 0 deletions docs/sources/source/api-docs/parts.rst
@@ -10,3 +10,5 @@ parts

parts.rnns
parts.transformer
parts.convs2s
parts.cnns
8 changes: 0 additions & 8 deletions docs/sources/source/api-docs/parts.transformer.rst
@@ -22,14 +22,6 @@ beam\_search
:undoc-members:
:show-inheritance:

beam\_search\_test
---------------------------------------------------------

.. automodule:: parts.transformer.beam_search_test
:members:
:undoc-members:
:show-inheritance:

common
---------------------------------------------

2 changes: 1 addition & 1 deletion docs/sources/source/installation-instructions.rst
@@ -32,7 +32,7 @@ run unittests::

python -m unittest discover -s open_seq2seq -p '*_test.py'

It might take up to 10 minutes. You should see a lot of output, but no errors
It might take up to 30 minutes. You should see a lot of output, but no errors
in the end.

.. _installation_speech:
84 changes: 54 additions & 30 deletions docs/sources/source/models-and-recipes.rst
@@ -3,22 +3,21 @@
Models and recipes
==================

.. This section will contain information about different models that OpenSeq2Seq
.. supports, exact config parameters to train them, final training/validation/test
.. metrics and links to checkpoints (tensorboards also?) of trained models.

.. note::
Currently OpenSeq2Seq has model implementations for machine translation and
automatic speech recognition. All models work both in float32 and mixed precision.
We recommend you use :ref:`mixed precision training <mixed_precision>` when training on Volta GPUs.
automatic speech recognition.
All models work both in float32 and mixed precision.
We recommend you use :ref:`mixed precision training <mixed_precision>`
when training on Volta GPUs.


To train models you can use the following
commands (don't forget to substitute valid config_file path there).
To train models you can use the following commands (don't forget to substitute
valid config_file path there and number of GPUs if using Horovod).

With Horovod (highly recommended when using multiple GPUs)::

mpirun --allow-run-as-root --mca orte_base_help_aggregate 0 -mca btl ^openib -np 4 -H localhost:4 -bind-to none -map-by slot -x LD_LIBRARY_PATH python run.py --config_file=... --mode=train_eval --use_horovod=True --enable_logs
mpiexec --allow-run-as-root -np <num_gpus> python run.py --config_file=... --mode=train_eval --use_horovod=True --enable_logs

Without Horovod::

@@ -29,6 +28,16 @@ The description of implemented models is available in the next sections:
Machine translation
-------------------

The table below contains descriptions and results of the
machine translation models available in OpenSeq2Seq.
Currently, we have GNMT-based, Transformer-based, and
ConvS2S-based models.

We measure the BLEU score on the newstest2014.tok.de file using the ``multi-bleu.perl`` script from Moses.
For more details about model descriptions and training setup,
have a look at the `configuration files <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/text2text/en-de>`_.
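
As a rough sketch (file names are illustrative, not taken from this repository),
scoring a tokenized model output against the tokenized reference with the Moses
script would look like::

    perl multi-bleu.perl newstest2014.tok.de < model_output.tok.de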


.. list-table::
:widths: 1 1 1 1 1
:header-rows: 1
@@ -38,72 +47,87 @@ Machine translation
- Training setup and additional comments
- Short description of the model
- Checkpoint
* - `en-de-nmt-small.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/text2text/en-de-nmt-small.py>`_
* - `en-de-nmt-small.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/text2text/en-de/en-de-nmt-small.py>`_
- 20.23
- This model should train on a single GPU such as a 1080Ti. It is trained using the Adam optimizer.
- RNN-based. Bi-directional encoder with 2 layers and a GNMT-like decoder with 2 layers and attention. Uses LSTM cells of size 512.
- `link <https://drive.google.com/file/d/1Ty9hiOQx4V28jJmIbj7FWUyw7LVA39SF/view?usp=sharing>`_
* - `en-de-gnmt-like-4GPUs.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/text2text/en-de-gnmt-like-4GPUs.py>`_
* - `en-de-gnmt-like-4GPUs.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py>`_
- 23.89
- This model was trained on 4 GPUs with Adam optimizer and learning rate decay.
- RNN-based. This is a GNMT-like model that tries to match the one described in https://arxiv.org/abs/1609.08144 as closely as possible.
- `link <https://drive.google.com/file/d/1HVc4S8-wv1-AZK1JeWgn6YNITSFAMes_/view?usp=sharing>`_
* - `transformer-big.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/text2text/transformer-big.py>`_
* - `transformer-big.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/text2text/en-de/transformer-big.py>`_
- 26.17
- This model was trained on 4 GPUs with Adam optimizer and learning rate decay.
- Transformer "big" model. This model does not have any RNN layers.
- `link <https://drive.google.com/file/d/151R6iCCtehRLpnH3nBmhEi_nhNO2mXW8/view?usp=sharing>`_
* - `en-de-convs2s.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/text2text/en-de/en-de-convs2s.py>`_
- xx.xx
- This model was trained on 4 GPUs with Adam optimizer, learning rate decay and warm-up.
- This is an implementation of the ConvS2S model proposed in https://arxiv.org/abs/1705.03122.
- Coming soon.

GNMT model description can be found `here <https://arxiv.org/abs/1609.08144>`_.
Transformer model description can be found `here <https://arxiv.org/abs/1706.03762>`_.
We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Moses.
GNMT model description: https://arxiv.org/abs/1609.08144.

Transformer model description: https://arxiv.org/abs/1706.03762.

ConvS2S model description: https://arxiv.org/abs/1705.03122.

Speech recognition
------------------

Deep Speech 2 based models
~~~~~~~~~~~~~~~~~~~~~~~~~~
Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595.
The table below contains descriptions and results of
Deep Speech 2 based models available in OpenSeq2Seq.
speech recognition models available in OpenSeq2Seq.
Currently, we have DeepSpeech2-based and Wav2Letter-based models.

WER-512 and WER-2048 is word error rate obtained with beam width of 512 and 2048
correspondingly. For beam width of 2048 we also used ``batch_size_per_gpu = 1``
WER is the word error rate obtained on the dev-clean subset of LibriSpeech using
a greedy decoder (``decoder_params/use_language_model = False``).
For the final evaluation we used ``batch_size_per_gpu = 1``
to eliminate the effect of `cudnn padding issue <https://github.com/NVIDIA/OpenSeq2Seq/issues/69>`_.
For more details about model descriptions and training setup,
have a look at the `configuration files <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/speech2text/>`_.
have a look at the `configuration files <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/speech2text>`_.
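
For reference, these evaluation settings would look roughly like this in a
config file (a minimal sketch, assuming the dictionary structure of the example
configs; only the keys named above are shown)::

    base_params = {
        # evaluate one utterance at a time to sidestep the cudnn padding issue
        "batch_size_per_gpu": 1,
        "decoder_params": {
            # greedy decoding, no language model rescoring
            "use_language_model": False,
        },
    }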

.. list-table::
:widths: 1 1 1 1 1 1
:widths: 1 1 1 1 1
:header-rows: 1

* - Config file
- WER-512
- WER-2048
- WER
- Training setup and additional comments
- Short description of the model
- Checkpoint
* - `ds2_large_8gpus.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/speech2text/ds2_large_8gpus.py>`_
- 4.90%
- 4.59%
- 14.89%
- This model was trained for 50 epochs using SGD with Momentum and LARC on
the full LibriSpeech in a few days using Horovod on eight GPUs.
- This model has 2 convolutional layers and 5 bidirectional
GRU layers with 800 units.
- `link <https://drive.google.com/file/d/1gfGg3DzXviNhYlIyxl12gWp47R8Uz-Bf/view?usp=sharing>`_
* - `ds2_medium_4gpus.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/speech2text/ds2_medium_4gpus.py>`_
- 6.12%
- 5.49%
- 22.60%
- This model was trained for 50 epochs using Adam on the full
LibriSpeech in a few days using Horovod on four GPUs.
- This model has 3 convolutional layers and 3 unidirectional
GRU layers with 1024 units.
- `link <https://drive.google.com/file/d/1XpnyZzMaO38RE4dSOJZkcaJ3T8B0lxKe/view?usp=sharing>`_
* - `ds2_small_1gpu.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/speech2text/ds2_small_1gpu.py>`_
- 11.77%
- 9.32%
- 39.08%
- This model was trained for 12 epochs using Adam on a "clean" subset of
LibriSpeech in less than a day using a single GPU.
- This model has 2 convolutional layers and 2 bidirectional
GRU layers with 512 units.
- `link <https://drive.google.com/file/d/1-OEvxyg7rCogZhejen7pNuKkgvuwCdbk/view?usp=sharing>`_
* - `w2l_large_8gpus.py <https://github.com/NVIDIA/OpenSeq2Seq/blob/master/example_configs/speech2text/w2l_large_8gpus.py>`_
- 15.38%
- This model was trained for 18 epochs (with early stopping based on
validation loss) using SGD with Momentum and LARC on
the full LibriSpeech in a few days on eight GPUs.
- The model has 19 convolutional layers (200--1000 units, kernel sizes 7--21).
We use batch norm between all layers.
- Coming soon.


Deep Speech 2 model description: https://arxiv.org/abs/1512.02595.

Wav2Letter model description: https://arxiv.org/abs/1609.03193, https://arxiv.org/abs/1712.09444.
14 changes: 3 additions & 11 deletions open_seq2seq/models/speech2text_ds2_test.py
@@ -1,21 +1,13 @@
# Copyright (c) 2017 NVIDIA Corporation
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
from six.moves import range

import tensorflow as tf
import numpy as np
import copy
import numpy.testing as npt
import tempfile
import os
import pandas as pd

from open_seq2seq.test_utils.test_speech_configs.ds2_test_config import \
base_params, train_params, eval_params, base_model

from .speech2text_test import Speech2TextModelTests
from open_seq2seq.test_utils.test_speech_configs.ds2_test_config import base_params, \
train_params, \
eval_params, \
base_model


class DS2ModelTests(Speech2TextModelTests):
21 changes: 3 additions & 18 deletions open_seq2seq/models/speech2text_w2l_test.py
@@ -1,24 +1,15 @@
# Copyright (c) 2017 NVIDIA Corporation
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
from six.moves import range

import tensorflow as tf
import numpy as np
import copy
import numpy.testing as npt
import tempfile
import os
import pandas as pd

from open_seq2seq.test_utils.test_speech_configs.w2l_test_config import \
base_params, train_params, eval_params, base_model
from .speech2text_test import Speech2TextModelTests
from open_seq2seq.test_utils.test_speech_configs.w2l_test_config import base_params, \
train_params, \
eval_params, \
base_model


class DS2ModelTests(Speech2TextModelTests):
class W2LModelTests(Speech2TextModelTests):

def setUp(self):
self.base_model = base_model
@@ -32,12 +23,6 @@ def tearDown(self):
def test_convergence(self):
return self.convergence_test(5.0, 30.0, 0.1)

def test_convergence_with_iter_size(self):
return self.convergence_with_iter_size_test()

def test_infer(self):
return self.infer_test()

def test_mp_collection(self):
return self.mp_collection_test(14, 6)

8 changes: 5 additions & 3 deletions open_seq2seq/parts/cnns/conv_blocks.py
@@ -11,6 +11,7 @@
def conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides,
padding, regularizer, training, data_format):
"""Helper function that applies convolution and activation.

Args:
type: the following types are supported
'conv1d', 'conv2d'
@@ -38,11 +39,12 @@ def conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides,
return output


def conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, strides,
padding, regularizer, training, data_format, bn_momentum,
bn_epsilon):
def conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn,
strides, padding, regularizer, training, data_format,
bn_momentum, bn_epsilon):
"""Helper function that applies convolution, batch norm and activation.
Accepts inputs in 'channels_last' format only.

Args:
type: the following types are supported
'conv1d', 'conv2d'
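
A hypothetical call to the batch-norm helper, based only on the signature shown
above (shapes and hyperparameter values are illustrative, not taken from any
config in this repository):

import tensorflow as tf
from open_seq2seq.parts.cnns.conv_blocks import conv_bn_actv

# Illustrative [batch, time, features] input in 'channels_last' format,
# as required by conv_bn_actv.
inputs = tf.placeholder(tf.float32, shape=[None, 200, 64])
output = conv_bn_actv(
    type="conv1d",              # docstring: 'conv1d' and 'conv2d' are supported
    name="conv_bn_1",
    inputs=inputs,
    filters=256,
    kernel_size=11,
    activation_fn=tf.nn.relu,
    strides=1,
    padding="SAME",
    regularizer=None,           # e.g. a tf.contrib.layers.l2_regularizer
    training=True,
    data_format="channels_last",
    bn_momentum=0.99,
    bn_epsilon=1e-3,
)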
4 changes: 2 additions & 2 deletions run.py
@@ -69,10 +69,10 @@ def main():
# with command line arguments that were passed to the script
parser_unk = argparse.ArgumentParser()
for pm, value in flatten_dict(base_config).items():
if isinstance(value, int) or isinstance(value, float) or \
if type(value) == int or type(value) == float or \
isinstance(value, string_types):
parser_unk.add_argument('--' + pm, default=value, type=type(value))
elif isinstance(value, bool):
elif type(value) == bool:
parser_unk.add_argument('--' + pm, default=value, type=ast.literal_eval)
config_update = parser_unk.parse_args(unknown)
nested_update(base_config, nest_dict(vars(config_update)))
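
The switch from ``isinstance`` to exact ``type`` comparisons matters for
booleans: ``bool`` is a subclass of ``int``, so under the old check a boolean
config value matched the first branch and was registered with ``type=bool``,
and since ``bool("False")`` is truthy, such flags could never be switched off
from the command line. A standalone illustration (not from the repository):

value = True
print(isinstance(value, int))  # True  -- a bool passes the isinstance(int) check
print(type(value) == int)      # False -- the exact check routes bools to the elif
print(bool("False"))           # True  -- why argparse must not parse flags with type=bool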