This repository was archived by the owner on Jul 18, 2024. It is now read-only.
Merged
18 commits
cbdf56e
Merge branch 'feat/extend-rnn' into feat/attn-decoder-rnn
aasseman Apr 24, 2019
4d698e1
Merge branch 'feat/extend-rnn' into feat/attn-decoder-rnn
aasseman Apr 24, 2019
09005c7
Fixed DataDefinition of RecurrentNeuralNetwork's output and input sta…
aasseman Apr 24, 2019
0ba7a73
Merge branch 'feat/extend-rnn' into feat/attn-decoder-rnn
aasseman Apr 24, 2019
3ab87f7
Merge branch 'feat/extend-rnn' into feat/attn-decoder-rnn
aasseman Apr 24, 2019
964a407
Merge branch 'feat/extend-rnn' into feat/attn-decoder-rnn
aasseman Apr 25, 2019
0ec24fa
Added first prototype of Attn_Decoder, with dummy wikitext test
aasseman Apr 25, 2019
0a53520
Merge branch 'refact/better-download' into feat/attn-decoder-rnn
aasseman Apr 26, 2019
df60fb4
Added translation problem
aasseman Apr 26, 2019
04a42d6
Add fixed padding option to sentence_embeddings, sentence_indexer
aasseman Apr 26, 2019
82a2121
Changed translation config for fixed padding compatibility
aasseman Apr 26, 2019
0138ef2
Merge branch 'develop' of github.com:IBM/pytorchpipe into feat/attn-d…
aasseman Apr 26, 2019
dd1477d
Merge branch 'feat/fixed-sentence-padding' into feat/attn-decoder-rnn
aasseman Apr 26, 2019
1fad1fc
Merge branch 'develop' of github.com:IBM/pytorchpipe into feat/attn-d…
aasseman Apr 26, 2019
afaf7df
Cleaning
aasseman Apr 26, 2019
93a1167
Merge branch 'vqa_med_yn_fix' of github.com:IBM/pytorchpipe into feat…
aasseman Apr 26, 2019
bc58318
Merge branch 'develop' of github.com:IBM/pytorchpipe into feat/attn-d…
aasseman Apr 27, 2019
b8ed220
Cleaning
aasseman Apr 27, 2019
78 changes: 78 additions & 0 deletions configs/default/components/models/attn_decoder_rnn.yml
@@ -0,0 +1,78 @@
# This file defines the default values for the Attn_Decoder_RNN model.

####################################################################
# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
####################################################################

# Size of the hidden state (LOADED)
hidden_size: 100

# Whether to include the last hidden state in the outputs (LOADED)
output_last_state: False

# Type of recurrent cell (LOADED)
# -> Only GRU is supported

# Number of "stacked" layers (LOADED)
# -> Only a single layer is supported

# Dropout rate (LOADED)
# Default: 0 (means that it is turned off)
dropout_rate: 0

# Prediction mode (LOADED)
# Options:
# * Dense (passes every activation through output layer) |
# * Last (passes only the last activation through output layer) |
# * None (all outputs are discarded)
prediction_mode: Dense

# Enable the FFN layer at the output of the RNN (applied before the output is eventually fed back during autoregression).
# Set to False if the raw outputs of the RNN are needed, e.g. for an attention encoder-decoder.
ffn_output: True

# Length of generated output sequence (LOADED)
# User must set it per task, as it is task specific.
autoregression_length: 10

# If true, output of the last layer will be additionally processed with Log Softmax (LOADED)
use_logsoftmax: True

streams:
####################################################################
# 2. Keymappings associated with INPUT and OUTPUT streams.
####################################################################

# Stream containing batch of encoder outputs (INPUT)
inputs: inputs

# Stream containing the initial state of the RNN (INPUT)
# The stream will only be created if `initial_state: Input`
input_state: input_state

# Stream containing predictions (OUTPUT)
predictions: predictions

# Stream containing the final output state of the RNN (OUTPUT)
# The stream will only be created if `output_last_state: True`
output_state: output_state

globals:
####################################################################
# 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
####################################################################

# Size of the input (RETRIEVED)
input_size: input_size

# Size of the prediction (RETRIEVED)
prediction_size: prediction_size

####################################################################
# 4. Keymappings associated with GLOBAL variables that will be SET.
####################################################################

####################################################################
# 5. Keymappings associated with statistics that will be ADDED.
####################################################################
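For reference, the component configured above follows the attention-decoder pattern of the PyTorch seq2seq tutorial cited in the pipeline below: at each of `autoregression_length` steps it attends over the encoder outputs and feeds its own output back as the next input. Here is a minimal sketch of one such step, assuming tutorial-style attention; the class and argument names are illustrative, not the component's actual API:

import torch
import torch.nn as nn
import torch.nn.functional as F

class AttnDecoderStep(nn.Module):
    """One step of a GRU decoder with attention (illustrative sketch only)."""
    def __init__(self, hidden_size, max_length=10):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, max_length)
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, prev_output, hidden, encoder_outputs):
        # prev_output: [batch, hidden], hidden: [1, batch, hidden],
        # encoder_outputs: [batch, max_length, hidden].
        # Attention weights computed from the fed-back output and the hidden state.
        weights = F.softmax(
            self.attn(torch.cat((prev_output, hidden[0]), dim=1)), dim=1)
        # Context vector: attention-weighted sum of the encoder outputs.
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)
        # Combine previous output and context, then advance the GRU by one step.
        gru_input = F.relu(
            self.attn_combine(torch.cat((prev_output, context), dim=1)))
        output, hidden = self.gru(gru_input.unsqueeze(1), hidden)
        return output.squeeze(1), hidden

In use, such a step would run in a loop for `autoregression_length` iterations, feeding `output` back in as `prev_output`; with `ffn_output: True` the output would additionally pass through the FFN layer before being fed back.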

@@ -0,0 +1,49 @@
# This file defines the default values for the TranslationPairs problem.

####################################################################
# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
####################################################################

# Folder where the problem will store data (LOADED)
data_folder: ~/data/language_modeling/translation_pairs

# Defines the dataset that will be used (LOADED)
# Options: eng-fra, eng-pol
dataset: eng-fra

# Defines the subset that will be used (LOADED)
# Options: train | valid | test
subset: train

# Length limit of the source and target sentences (LOADED)
# If < 0, no limit is applied.
sentence_length: 10

streams:
####################################################################
# 2. Keymappings associated with INPUT and OUTPUT streams.
####################################################################

# Stream containing batch of indices (OUTPUT)
# Every problem MUST return that stream.
indices: indices

# Stream containing batch of tokenized source sentences (OUTPUT)
sources: sources

# Stream containing batch of tokenized target sentences (OUTPUT)
targets: targets

globals:
####################################################################
# 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
####################################################################

####################################################################
# 4. Keymappings associated with GLOBAL variables that will be SET.
####################################################################

####################################################################
# 5. Keymappings associated with statistics that will be ADDED.
####################################################################
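For context, the configuration above implies a simple data path: read tab-separated Tatoeba-style sentence pairs, tokenize, and drop pairs whose length exceeds `sentence_length`. A hedged sketch of that filtering step, assuming a tab-separated pair file; the function name and file layout are assumptions, not the problem's actual implementation:

def load_translation_pairs(path, sentence_length=10):
    """Load tab-separated source/target sentence pairs, filtering by token count."""
    pairs = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            source, target = line.rstrip("\n").split("\t")[:2]
            src_tokens, tgt_tokens = source.split(), target.split()
            # sentence_length < 0 disables the limit (see the option above).
            if sentence_length < 0 or max(len(src_tokens), len(tgt_tokens)) <= sentence_length:
                pairs.append({"sources": src_tokens, "targets": tgt_tokens})
    return pairs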

172 changes: 172 additions & 0 deletions configs/translation/eng_fra_translation_enc_attndec.yml
@@ -0,0 +1,172 @@
# This pipeline applies an encoder-decoder GRU with attention to the open Tatoeba translation sentence pairs.
# Inspired by https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html .
# Note that training will be slower than in the tutorial, as teacher forcing is not implemented here.

# Training parameters:
training:
problem:
type: &p_type TranslationPairs
data_folder: &data_folder ~/data/language_modeling/translation_pairs
dataset: &dataset eng-fra
subset: train
sentence_length: 10
batch_size: 64

# optimizer parameters:
optimizer:
name: Adam
lr: 1.0e-3

# settings parameters
terminal_conditions:
loss_stop: 1.0e-2
episode_limit: 1000000
epoch_limit: 100

# Validation parameters:
validation:
partial_validation_interval: 100
problem:
type: *p_type
data_folder: *data_folder
dataset: *dataset
subset: valid
sentence_length: 10
batch_size: 64

# Testing parameters:
testing:
problem:
type: *p_type
data_folder: *data_folder
dataset: *dataset
subset: test
sentence_length: 10
batch_size: 64

pipeline:
name: eng_fra_translation_enc_attndec

# Source encoding - model 1.
source_sentence_embedding:
type: SentenceEmbeddings
priority: 1.1
embeddings_size: 50
pretrained_embeddings: glove.6B.50d.txt
data_folder: *data_folder
source_vocabulary_files: eng-fra/eng.train.txt,eng-fra/eng.valid.txt,eng-fra/eng.test.txt
vocabulary_mappings_file: eng-fra/eng.all.tokenized_words
regenerate: True
additional_tokens: <eos>
import_word_mappings_from_globals: False
export_word_mappings_to_globals: False
fixed_padding: 10
streams:
inputs: sources
outputs: embedded_sources

# Target encoding.
target_indexer:
type: SentenceIndexer
priority: 2.1
data_folder: *data_folder
source_vocabulary_files: eng-fra/fra.train.txt,eng-fra/fra.valid.txt,eng-fra/fra.test.txt
import_word_mappings_from_globals: False
export_word_mappings_to_globals: True
fixed_padding: 10
regenerate: True
streams:
inputs: targets
outputs: indexed_targets

# Single layer GRU Encoder
encoder:
type: RecurrentNeuralNetwork
cell_type: GRU
priority: 3
initial_state: Trainable
hidden_size: 50
num_layers: 1
use_logsoftmax: False
output_last_state: True
prediction_mode: Dense
ffn_output: False
streams:
inputs: embedded_sources
predictions: s2s_encoder_output
output_state: s2s_state_output
globals:
input_size: embeddings_size
prediction_size: embeddings_size

# Single layer GRU Decoder with attention
decoder:
type: Attn_Decoder_RNN
priority: 4
hidden_size: 50
use_logsoftmax: False
autoregression_length: 10
prediction_mode: Dense
streams:
inputs: s2s_encoder_output
predictions: s2s_decoder_output
input_state: s2s_state_output
globals:
input_size: embeddings_size
prediction_size: embeddings_size

# FFN resizing the seq2seq output from the embedding size to the size of the target vocabulary.
ff_resize_s2s_output:
type: FeedForwardNetwork
use_logsoftmax: True
dimensions: 3
priority: 5
streams:
inputs: s2s_decoder_output
globals:
input_size: embeddings_size
prediction_size: vocabulary_size

# Loss
nllloss:
type: NLLLoss
priority: 6
num_targets_dims: 2
streams:
targets: indexed_targets
loss: loss

# Prediction decoding.
prediction_decoder:
type: SentenceIndexer
priority: 10
# Reverse mode.
reverse: True
# Use distributions as inputs.
use_input_distributions: True
data_folder: *data_folder
import_word_mappings_from_globals: True
streams:
inputs: predictions
outputs: prediction_sentences


# Statistics.
batch_size:
type: BatchSizeStatistics
priority: 100.0

bleu:
type: BLEUStatistics
priority: 100.2
streams:
targets: indexed_targets


# Viewers.
viewer:
type: StreamViewer
priority: 100.3
input_streams: sources,targets,indexed_targets,prediction_sentences

#: pipeline
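To make the stream wiring concrete, here is a rough end-to-end sketch of the tensor flow this pipeline describes, in plain PyTorch. Shapes follow the config (batch_size: 64, fixed_padding: 10, embeddings_size 50); the stand-in decoder and the vocabulary size are placeholders for illustration, not pytorchpipe's API:

import torch
import torch.nn as nn

batch, seq_len, emb, vocab = 64, 10, 50, 1000  # vocab size is a placeholder

# Stand-ins for the outputs of source_sentence_embedding and target_indexer.
embedded_sources = torch.randn(batch, seq_len, emb)
indexed_targets = torch.randint(0, vocab, (batch, seq_len))

# encoder: single-layer GRU, raw outputs kept (ffn_output: False) plus final state.
encoder = nn.GRU(emb, emb, batch_first=True)
s2s_encoder_output, s2s_state_output = encoder(embedded_sources)

# decoder: attends over s2s_encoder_output starting from s2s_state_output and
# emits one embedding-sized vector per autoregression step (stand-in here).
s2s_decoder_output = torch.randn(batch, seq_len, emb)

# ff_resize_s2s_output: embeddings_size -> vocabulary_size, then log-softmax.
ff_resize = nn.Sequential(nn.Linear(emb, vocab), nn.LogSoftmax(dim=-1))
predictions = ff_resize(s2s_decoder_output)  # [batch, seq_len, vocab]

# nllloss with num_targets_dims: 2 -> flatten batch and time before NLLLoss.
loss = nn.NLLLoss()(predictions.view(-1, vocab), indexed_targets.view(-1))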