diff --git a/.gitignore b/.gitignore
index 894a44c..7696fb1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,6 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# vscode
+.vscode/
\ No newline at end of file
diff --git a/configs/default/components/models/recurrent_neural_network.yml b/configs/default/components/models/recurrent_neural_network.yml
index a0e6f5e..6767753 100644
--- a/configs/default/components/models/recurrent_neural_network.yml
+++ b/configs/default/components/models/recurrent_neural_network.yml
@@ -37,6 +37,10 @@ dropout_rate: 0
 #   * None (all outputs are discarded)
 prediction_mode: Dense
 
+# Enable FFN layer at the output of the RNN (before eventual feed back in the case of autoregression).
+# Useful if the raw outputs of the RNN are needed, for attention encoder-decoder for example.
+ffn_output: True
+
 # Input mode
 # Options:
 #   * Dense (every iteration expects an input)
diff --git a/configs/wikitext/wikitext_language_modeling_seq2seq.yml b/configs/wikitext/wikitext_language_modeling_seq2seq.yml
index 84bbeaf..aa531fd 100644
--- a/configs/wikitext/wikitext_language_modeling_seq2seq.yml
+++ b/configs/wikitext/wikitext_language_modeling_seq2seq.yml
@@ -73,28 +73,6 @@ pipeline:
     streams:
       inputs: targets
       outputs: indexed_targets
-
-  # Publish the hidden size of the seq2seq
-  global_publisher:
-    type: GlobalVariablePublisher
-    priority: 1
-    # Add input_size to globals, so classifier will use it.
-    keys: s2s_hidden_size
-    values: 300
-
-  # FF, to resize the embeddings to whatever the hidden size of te seq2seq is.
-  ff_resize_s2s_input:
-    type: FeedForwardNetwork 
-    priority: 2.5
-    s2s_hidden_size: 300
-    use_logsoftmax: False
-    dimensions: 3
-    streams:
-      inputs: embedded_sources
-      predictions: embedded_sources_resized
-    globals:
-      input_size: embeddings_size
-      prediction_size: s2s_hidden_size
   
   # LSTM Encoder
   lstm_encoder:
@@ -107,12 +85,12 @@ pipeline:
     output_last_state: True
     prediction_mode: Last
     streams:
-      inputs: embedded_sources_resized
+      inputs: embedded_sources
       predictions: s2s_encoder_output
       output_state: s2s_state_output
     globals:
-      input_size: s2s_hidden_size
-      prediction_size: s2s_hidden_size 
+      input_size: embeddings_size
+      prediction_size: embeddings_size 
 
   # LSTM Decoder
   lstm_decoder:
@@ -130,10 +108,10 @@ pipeline:
       predictions: s2s_decoder_output
       input_state: s2s_state_output
     globals:
-      input_size: s2s_hidden_size
-      prediction_size: s2s_hidden_size 
+      input_size: embeddings_size
+      prediction_size: embeddings_size 
 
-  # FF, to resize the from the hidden size of the seq2seq to the size of the target vector
+  # FF, to resize the from the output size of the seq2seq to the size of the target vector
   ff_resize_s2s_output:
     type: FeedForwardNetwork 
     use_logsoftmax: True
@@ -142,7 +120,7 @@ pipeline:
     streams:
       inputs: s2s_decoder_output
     globals:
-      input_size: s2s_hidden_size
+      input_size: embeddings_size
       prediction_size: vocabulary_size
 
   # Loss
diff --git a/configs/wikitext/wikitext_language_modeling_seq2seq_simple.yml b/configs/wikitext/wikitext_language_modeling_seq2seq_simple.yml
index 731d590..fd489db 100644
--- a/configs/wikitext/wikitext_language_modeling_seq2seq_simple.yml
+++ b/configs/wikitext/wikitext_language_modeling_seq2seq_simple.yml
@@ -73,28 +73,6 @@ pipeline:
     streams:
       inputs: targets
       outputs: indexed_targets
-
-  # Publish the hidden size of the seq2seq
-  global_publisher:
-    type: GlobalVariablePublisher
-    priority: 1
-    # Add input_size to globals, so classifier will use it.
-    keys: s2s_hidden_size
-    values: 300
-
-  # FF, to resize the embeddings to whatever the hidden size of te seq2seq is.
-  ff_resize_s2s_input:
-    type: FeedForwardNetwork 
-    priority: 2.5
-    s2s_hidden_size: 300
-    use_logsoftmax: False
-    dimensions: 3
-    streams:
-      inputs: embedded_sources
-      predictions: embedded_sources_resized
-    globals:
-      input_size: embeddings_size
-      prediction_size: s2s_hidden_size
   
   # LSTM seq2seq
   lstm_encoder:
@@ -105,11 +83,11 @@ pipeline:
     num_layers: 3
     use_logsoftmax: False
     streams:
-      inputs: embedded_sources_resized
+      inputs: embedded_sources
       predictions: s2s_output
     globals:
-      input_size: s2s_hidden_size
-      prediction_size: s2s_hidden_size 
+      input_size: embeddings_size
+      prediction_size: embeddings_size 
 
   # FF, to resize the from the hidden size of the seq2seq to the size of the target vector
   ff_resize_s2s_output:
@@ -120,7 +98,7 @@ pipeline:
     streams:
       inputs: s2s_output
     globals:
-      input_size: s2s_hidden_size
+      input_size: embeddings_size
       prediction_size: vocabulary_size
 
   # Loss
diff --git a/ptp/components/models/recurrent_neural_network.py b/ptp/components/models/recurrent_neural_network.py
index 75a7bd4..041a8fa 100644
--- a/ptp/components/models/recurrent_neural_network.py
+++ b/ptp/components/models/recurrent_neural_network.py
@@ -38,6 +38,7 @@ def __init__(self, name, config):
         # Get input/output mode
         self.input_mode = self.config["input_mode"]
         self.output_last_state = self.config["output_last_state"]
+        self.ffn_output = self.config["ffn_output"]
 
         # Get prediction mode from configuration.
         self.prediction_mode = self.config["prediction_mode"]
@@ -68,6 +69,9 @@ def __init__(self, name, config):
                 self.prediction_size = self.prediction_size[0]
             else:
                 raise ConfigurationError("RNN prediction size '{}' must be a single dimension (current {})".format(self.key_prediction_size, self.prediction_size))
+
+        if "Autoregression" in self.input_mode:
+            assert self.input_size == self.prediction_size, "In autoregression mode, needs input_size == prediction_size."
         
         # Retrieve hidden size from configuration.
         self.hidden_size = self.config["hidden_size"]
@@ -134,7 +138,9 @@ def __init__(self, name, config):
         self.logger.info("Initializing RNN with input size = {}, hidden size = {} and prediction size = {}".format(self.input_size, self.hidden_size, self.prediction_size))
 
         # Create the output layer.
-        self.activation2output = torch.nn.Linear(self.hidden_size, self.prediction_size)
+        self.activation2output_lin = None
+        if(self.ffn_output):
+            self.activation2output_lin = torch.nn.Linear(self.hidden_size, self.prediction_size)
         
         # Create the final non-linearity.
         self.use_logsoftmax = self.config["use_logsoftmax"]
@@ -157,6 +163,25 @@ def initialize_hiddens_state(self, batch_size):
             # Return hidden_state.
             return self.init_hidden.expand(self.num_layers, batch_size, self.hidden_size).contiguous()
 
+    def activation2output(self, activations):
+        output = self.dropout(activations)
+
+        if(self.ffn_output):
+            #output = activations.squeeze(1)
+            shape = activations.shape
+
+            # Reshape to 2D tensor [BATCH_SIZE * SEQ_LEN x HIDDEN_SIZE]
+            output = output.contiguous().view(-1, shape[2])
+
+            # Propagate data through the output layer [BATCH_SIZE * SEQ_LEN x PREDICTION_SIZE]
+            output = self.activation2output_lin(output)
+            #output = output.unsqueeze(1)
+
+            # Reshape back to 3D tensor [BATCH_SIZE x SEQ_LEN x PREDICTION_SIZE]
+            output = output.view(shape[0], shape[1], output.size(1))
+
+        return output
+
 
     def input_data_definitions(self):
         """ 
@@ -173,7 +198,10 @@ def input_data_definitions(self):
 
         # Input hidden state
         if self.initial_state == "Input":
-            d[self.key_input_state] = DataDefinition([-1, 2 if self.cell_type == 'LSTM' else 1, self.input_size, 1, self.hidden_size], [torch.tensor], "Batch of RNN last states")
+            if self.cell_type == "LSTM":
+                d[self.key_input_state] = DataDefinition([2, self.num_layers, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states")
+            else:
+                d[self.key_input_state] = DataDefinition([self.num_layers, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states")
 
         return d
 
@@ -193,8 +221,11 @@ def output_data_definitions(self):
 
         # Output hidden state stream
         if self.output_last_state:
-            d[self.key_output_state] = DataDefinition([-1, 2 if self.cell_type == 'LSTM' else 1, self.input_size, 1, self.hidden_size], [torch.tensor], "Batch of RNN last states")
-        
+            if self.cell_type == "LSTM":
+                d[self.key_output_state] = DataDefinition([2, self.num_layers, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states")
+            else:
+                d[self.key_output_state] = DataDefinition([self.num_layers, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states")
+
         return d
 
     def forward(self, data_dict):
@@ -213,10 +244,9 @@ def forward(self, data_dict):
         # Get inputs [BATCH_SIZE x SEQ_LEN x INPUT_SIZE]
         if "None" in self.input_mode:
             batch_size = data_dict[self.key_input_state][0].shape[1]
-            inputs = torch.zeros(batch_size, 1, self.hidden_size)
+            inputs = torch.zeros(batch_size, self.hidden_size)
             if next(self.parameters()).is_cuda:
                 inputs = inputs.cuda() 
-
         else:
             inputs = data_dict[self.key_inputs]
             if inputs.dim() == 2:
@@ -235,56 +265,56 @@ def forward(self, data_dict):
         # Autoregressive mode - feed back outputs in the input
         if "Autoregression" in self.input_mode:
             activations_partial, hidden = self.rnn_cell(inputs, hidden)
+            activations_partial = self.activation2output(activations_partial)
             activations += [activations_partial]
+
             # Feed back the outputs iteratively
             for i in range(self.autoregression_length - 1):
                 activations_partial, hidden = self.rnn_cell(activations_partial, hidden)
+                activations_partial = self.activation2output(activations_partial)
                 # Add the single step output into list
                 if self.prediction_mode == "Dense":
                     activations += [activations_partial]
             # Reassemble all the outputs from list into an output sequence
             if self.prediction_mode == "Dense":
-                activations = torch.stack(activations, 1)
-            else:
-                activations = activations_partial
+                outputs = torch.cat(activations, 1)
+                # Log softmax - along PREDICTION dim.
+                if self.use_logsoftmax:
+                    outputs = self.log_softmax(outputs)
+                # Add predictions to datadict.
+                data_dict.extend({self.key_predictions: outputs})
+            elif self.prediction_mode == "Last":
+                if self.use_logsoftmax:
+                    outputs = self.log_softmax(activations_partial.squeeze(1))
+                # Add predictions to datadict.
+                data_dict.extend({self.key_predictions: outputs})
+
         # Normal mode - feed the entire input sequence at once
         else:
             activations, hidden = self.rnn_cell(inputs, hidden)
 
-        
-        # Propagate activations through dropout layer.
-        activations = self.dropout(activations)
-
-        if self.prediction_mode == "Dense":
-            # Pass every activation through the output layer.
-            # Reshape to 2D tensor [BATCH_SIZE * SEQ_LEN x HIDDEN_SIZE]
-            outputs = activations.contiguous().view(-1, self.hidden_size)
-
-            # Propagate data through the output layer [BATCH_SIZE * SEQ_LEN x PREDICTION_SIZE]
-            outputs = self.activation2output(outputs)
-
-            # Reshape back to 3D tensor [BATCH_SIZE x SEQ_LEN x PREDICTION_SIZE]
-            outputs = outputs.view(activations.size(0), activations.size(1), outputs.size(1))
-
-            # Log softmax - along PREDICTION dim.
-            if self.use_logsoftmax:
-                outputs = self.log_softmax(outputs)
-
-            # Add predictions to datadict.
-            data_dict.extend({self.key_predictions: outputs})
-        elif self.prediction_mode == "Last":
-            # Pass only the last activation through the output layer.
-            outputs = activations.contiguous()[:, -1, :].squeeze()
-            # Propagate data through the output layer [BATCH_SIZE x PREDICTION_SIZE]
-            outputs = self.activation2output(outputs)
-            # Log softmax - along PREDICTION dim.
-            if self.use_logsoftmax:
-                outputs = self.log_softmax(outputs)
-            # Add predictions to datadict.
-            data_dict.extend({self.key_predictions: outputs})
-        elif self.prediction_mode == "None":
-            # Nothing, since we don't want to keep the RNN's outputs
-            pass
+            if self.prediction_mode == "Dense":
+                # Pass every activation through the output layer.
+                outputs = self.activation2output(activations)
+                
+                # Log softmax - along PREDICTION dim.
+                if self.use_logsoftmax:
+                    outputs = self.log_softmax(outputs)
+
+                # Add predictions to datadict.
+                data_dict.extend({self.key_predictions: outputs})
+            elif self.prediction_mode == "Last":
+                outputs = self.activation2output(activations.contiguous()[:, -1, :].unsqueeze(1))
+                outputs = outputs.squeeze(1)
+
+                # Log softmax - along PREDICTION dim.
+                if self.use_logsoftmax:
+                    outputs = self.log_softmax(outputs)
+                # Add predictions to datadict.
+                data_dict.extend({self.key_predictions: outputs})
+            elif self.prediction_mode == "None":
+                # Nothing, since we don't want to keep the RNN's outputs
+                pass
 
         if self.output_last_state:
             data_dict.extend({self.key_output_state: hidden})
diff --git a/ptp/components/models/seq2seq_rnn.py b/ptp/components/models/seq2seq_rnn.py
index 813ab92..16380a8 100644
--- a/ptp/components/models/seq2seq_rnn.py
+++ b/ptp/components/models/seq2seq_rnn.py
@@ -185,25 +185,19 @@ def forward(self, data_dict):
 
         # Encoder
         activations, hidden = self.rnn_cell_enc(inputs, hidden)
+        activations_partial = self.activation2output(activations[:, -1, :])
 
         # Propagate inputs through rnn cell.
-        activations_partial, hidden = self.rnn_cell_dec(activations[:, -1, :].unsqueeze(1), hidden)
-        activations = []
-        activations += [activations_partial]
+        activations_partial, hidden = self.rnn_cell_dec(activations_partial.unsqueeze(1), hidden)
+        activations_partial = activations_partial.squeeze(1)
+        activations_partial = self.activation2output(activations_partial)
+        activations = [activations_partial]
         for i in range(self.autoregression_length - 1):
-            activations_partial, hidden = self.rnn_cell_dec(activations_partial, hidden)
+            activations_partial, hidden = self.rnn_cell_dec(activations_partial.unsqueeze(1), hidden)
+            activations_partial = activations_partial.squeeze(1)
+            activations_partial = self.activation2output(activations_partial)
             activations += [activations_partial]
-        activations = torch.stack(activations, 1)
-
-        # Pass every activation through the output layer.
-        # Reshape to 2D tensor [BATCH_SIZE * SEQ_LEN x HIDDEN_SIZE]
-        outputs = activations.contiguous().view(-1, self.hidden_size)
-
-        # Propagate data through the output layer [BATCH_SIZE * SEQ_LEN x PREDICTION_SIZE]
-        outputs = self.activation2output(outputs)
-
-        # Reshape back to 3D tensor [BATCH_SIZE x SEQ_LEN x PREDICTION_SIZE]
-        outputs = outputs.view(activations.size(0), activations.size(1), outputs.size(1))
+        outputs = torch.stack(activations, 1)
 
         # Log softmax - along PREDICTION dim.
         if self.use_logsoftmax: