This repository has been archived by the owner on Aug 3, 2021. It is now read-only.

Commit cca1551

Test new data b128.

VahidooX committed Aug 14, 2018
1 parent 381315a, commit cca1551
Showing 6 changed files with 16 additions and 13 deletions.
10 changes: 5 additions & 5 deletions example_configs/text2text/en-de/en-de-convs2s_dev.py
@@ -33,13 +33,13 @@
 
 iter_size = 1
 dtype = "mixed"  # or tf.float32
-shuffle_train = False
+shuffle_train = True
 use_horovod = True
 
 max_steps = int((4500000 / (num_gpus * batch_size * iter_size)) * epoch_num)
 
-conv_act = None  # tf.nn.relu, tf.nn.tanh, gated_linear_units
-normalization_type = "layer_norm"  # "weight_norm" or "batch_norm" or None
+conv_act = gated_linear_units  # tf.nn.relu, tf.nn.tanh, gated_linear_units
+normalization_type = "weight_norm"  # "weight_norm" or "batch_norm" or None
 scaling_factor = math.sqrt(0.5)  # changed here
 inti_var = None
 
@@ -88,7 +88,7 @@
 #"conv_nchannels_kwidth": [(512, 3)]*10 + [(768, 3)]*3 + [(2048, 1)]*2,
 
 # fairseq config
-"conv_nchannels_kwidth": [(512*2, 3)]*9 + [(1024, 3)]*4 + [(2048, 1)]*2,
+"conv_nchannels_kwidth": [(512, 3)]*9 + [(1024, 3)]*4 + [(2048, 1)]*2,
 
 "embedding_dropout_keep_prob": 0.8,
 "hidden_dropout_keep_prob": 0.8,

@@ -117,7 +117,7 @@
#"conv_nchannels_kwidth": [(512, 3)]*10 + [(768, 3)]*3 + [(2048, 1)]*2,

# fairseq config
"conv_nchannels_kwidth": [(512*2, 3)]*9 + [(1024, 3)]*4 + [(2048, 1)]*2,
"conv_nchannels_kwidth": [(512, 3)]*9 + [(1024, 3)]*4 + [(2048, 1)]*2,

"embedding_dropout_keep_prob": 0.8,
"hidden_dropout_keep_prob": 0.8,
Expand Down
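This config flips conv_act from None to gated_linear_units, the activation used in the original ConvS2S/fairseq setup, while also dropping the conv channel counts from 512*2 to 512. For reference, a minimal numpy sketch of the standard gated linear unit (Dauphin et al., 2016); the function body and shapes here are illustrative, not the repository's implementation:

import numpy as np

def gated_linear_units(x):
    # Standard GLU: split the channel axis in half and gate one half
    # with the sigmoid of the other: GLU([a; b]) = a * sigmoid(b).
    a, b = np.split(x, 2, axis=-1)
    return a * (1.0 / (1.0 + np.exp(-b)))

x = np.random.randn(4, 10, 1024)    # (batch, time, conv channels)
print(gated_linear_units(x).shape)  # (4, 10, 512): GLU halves the channels

Since GLU consumes half of the conv channels as gates, the conv_nchannels_kwidth values above interact directly with this choice of activation.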
6 changes: 3 additions & 3 deletions example_configs/text2text/en-de/en-de-convs2s_plus.py
@@ -33,16 +33,16 @@
 
 iter_size = 1
 dtype = "mixed"  # or tf.float32
-shuffle_train = True
+shuffle_train = False
 use_horovod = True
 
 max_steps = int((4500000 / (num_gpus * batch_size * iter_size)) * epoch_num)
 
 conv_act = None  # tf.nn.relu, tf.nn.tanh, gated_linear_units
-normalization_type = "layer_norm"  # "weight_norm" or "batch_norm" or None
+normalization_type = "batch_norm"  # "weight_norm" or "batch_norm" or None
 scaling_factor = 1.0  # math.sqrt(0.5), changed here
 
-inti_var = None  # 1e-3
+inti_var = 1e-3
 
 base_params = {
 # iter_size can be used just with horovod
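The max_steps formula above converts an epoch budget into optimizer steps: 4500000 is the approximate number of training sentence pairs, and num_gpus * batch_size * iter_size is the effective global batch. A worked example with assumed values (the commit title's "b128" suggests a batch size of 128; num_gpus and epoch_num below are guesses, not values from the commit):

num_gpus, batch_size, iter_size, epoch_num = 8, 128, 1, 30
train_size = 4500000  # sentence pairs, per the constant in the config

steps_per_epoch = train_size / (num_gpus * batch_size * iter_size)
max_steps = int(steps_per_epoch * epoch_num)
print(max_steps)  # 131835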
2 changes: 2 additions & 0 deletions open_seq2seq/decoders/convs2s_decoder.py
@@ -198,6 +198,8 @@ def _decode(self, input_dict):
     layer_id=i + 1,
     add_res=True,
     mode=self.mode,
+    normalization_type=self.normalization_type,
+    scaling_factor=self.scaling_factor,
     regularizer=self.regularizer,
     init_var=self.init_var
 )
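The decoder now forwards normalization_type and scaling_factor down to its attention layers instead of relying on the layer defaults. In ConvS2S, a scaling factor of math.sqrt(0.5) keeps the variance of a residual sum roughly constant: if the branch and the residual are independent with unit variance, their sum has variance 2, and multiplying by sqrt(0.5) brings it back to 1. A quick numpy check of that identity, illustrative only:

import numpy as np

rng = np.random.default_rng(0)
branch = rng.normal(size=1_000_000)    # layer output, unit variance
residual = rng.normal(size=1_000_000)  # skip connection, unit variance

print(np.var(branch + residual))                   # ~2.0
print(np.var((branch + residual) * np.sqrt(0.5)))  # ~1.0: variance preserved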
7 changes: 4 additions & 3 deletions open_seq2seq/decoders/convs2s_decoder2.py
@@ -183,7 +183,7 @@ def _decode(self, input_dict):
     hidden_dropout=self.params["hidden_dropout_keep_prob"],
     conv_padding="VALID",
     decode_padding=True,
-    activation=tf.nn.relu,  # changed here
+    activation=self.conv_activation,  # changed here
     normalization_type=self.normalization_type,
     regularizer=self.regularizer,
     init_var=self.init_var)
@@ -194,8 +194,8 @@ def _decode(self, input_dict):
     layer_id=i + 1,
     add_res=True,
     mode=self.mode,
-    normalization_type=self.normalization_type,
     scaling_factor=self.scaling_factor,
+    normalization_type=self.normalization_type,
     regularizer=self.regularizer,
     init_var=self.init_var)
 
@@ -321,7 +321,8 @@ def _call(self, decoder_inputs, encoder_outputs_a, encoder_outputs_b,
     outputs = (outputs + res_inputs) * self.scaling_factor
 
     # changed here
-    outputs = tf.nn.relu(outputs)  # self.conv_activation(outputs)
+    if i < len(self.layers) - 2:
+      outputs = tf.nn.relu(outputs)  # self.conv_activation(outputs)
 
 
 with tf.variable_scope("linear_layer_after_cnn_layers"):
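The hunk at line 321 stops applying the extra ReLU after the last two blocks of the stack. A minimal sketch of that control flow with stand-in layer functions (the real _call operates on TF tensors and conv/attention blocks):

import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

layers = [lambda x: 0.9 * x for _ in range(6)]  # stand-ins for conv blocks
scaling_factor = np.sqrt(0.5)

x = np.random.randn(2, 7, 16)
for i, layer in enumerate(layers):
    res = x
    x = (layer(x) + res) * scaling_factor  # residual add + variance scale
    if i < len(layers) - 2:                # no extra ReLU on the last two layers
        x = relu(x)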
2 changes: 1 addition & 1 deletion open_seq2seq/parts/convs2s/attention_wn_layer.py
@@ -15,8 +15,8 @@ class AttentionLayerNormalized(tf.layers.Layer):
   """Attention layer for convs2s with weight normalization"""
 
   def __init__(self, in_dim, embed_size, layer_id, add_res, mode,
-               normalization_type="weight_norm",
                scaling_factor=math.sqrt(0.5),
+               normalization_type="weight_norm",
                regularizer=None,
                init_var=None,
                ):
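This is only a reordering of keyword parameters in the signature; call sites that pass them by name, like the decoder changes above, are unaffected, and only positional callers would see a difference. A trivial illustration with a hypothetical function, not the repo's API:

def f(in_dim, scaling_factor=0.5, normalization_type="weight_norm"):
    return scaling_factor, normalization_type

# Keyword arguments are order-independent at the call site:
assert f(1, normalization_type="layer_norm", scaling_factor=1.0) == (1.0, "layer_norm")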
2 changes: 1 addition & 1 deletion open_seq2seq/parts/convs2s/conv_wn_layer.py
@@ -26,7 +26,7 @@ def __init__(self,
                decode_padding,
                activation=gated_linear_units,
                normalization_type="weight_norm",
-               regularizer=None, #tf.contrib.layers.l2_regularizer(scale=1e-4)
+               regularizer=None, # tf.contrib.layers.l2_regularizer(scale=1e-4)
                init_var=None,
                ):
   """initializes the 1D convolution layer.
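For reference, the "wn" in conv_wn_layer refers to weight normalization (Salimans & Kingma, 2016), which reparameterizes each weight vector as w = g * v / ||v||, decoupling its norm g from its direction v. A minimal numpy sketch of that identity, not the repository's TF implementation:

import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(size=(3, 512))         # direction parameters, one row per output
g = np.array([[1.0], [2.0], [0.5]])   # learned per-row scale

w = g * v / np.linalg.norm(v, axis=1, keepdims=True)
print(np.linalg.norm(w, axis=1))      # [1.  2.  0.5]: row norms equal g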
