From df2b5af86c16acb319f4f4fbd6be5a6477aacbca Mon Sep 17 00:00:00 2001
From: Alexis Asseman <33075224+aasseman@users.noreply.github.com>
Date: Thu, 25 Apr 2019 19:42:48 -0700
Subject: [PATCH 1/2] Add fixed padding option to sentence_embeddings,
 sentence_indexer

---
 .../components/models/sentence_embeddings.yml |  6 ++++
 .../components/text/sentence_indexer.yml      |  6 ++++
 ptp/components/models/sentence_embeddings.py  |  9 +++++
 ptp/components/text/sentence_indexer.py       | 33 +++++++++++++++++--
 ptp/components/utils/word_mappings.py         | 18 ++++++++++
 5 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/configs/default/components/models/sentence_embeddings.yml b/configs/default/components/models/sentence_embeddings.yml
index ab3a6da..7ca8987 100644
--- a/configs/default/components/models/sentence_embeddings.yml
+++ b/configs/default/components/models/sentence_embeddings.yml
@@ -25,6 +25,12 @@ import_word_mappings_from_globals: False
 # Flag informing whether word mappings will be exported to globals (LOADED)
 export_word_mappings_to_globals: False
 
+# Fixed padding length
+# -1 -> For each batch, automatically pad to the length of the longest sequence of the batch
+#       (variable from batch to batch)
+# > 0 -> Pad each sequence to the chosen length (fixed for all batches)
+fixed_padding: -1
+
 # File containing pretrained embeddings (LOADED)
 # Empty means that no embeddings will be loaded.
 pretrained_embeddings_file: ''
diff --git a/configs/default/components/text/sentence_indexer.yml b/configs/default/components/text/sentence_indexer.yml
index 0921bc7..8ec714f 100644
--- a/configs/default/components/text/sentence_indexer.yml
+++ b/configs/default/components/text/sentence_indexer.yml
@@ -25,6 +25,12 @@ import_word_mappings_from_globals: False
 # Flag informing whether word mappings will be exported to globals (LOADED)
 export_word_mappings_to_globals: False
 
+# Fixed padding length
+# -1 -> For each batch, automatically pad to the length of the longest sequence of the batch
+#       (variable from batch to batch)
+# > 0 -> Pad each sequence to the chosen length (fixed for all batches)
+fixed_padding: -1
+
 # Operation mode. If 'reverse' is True, then it will change indices into words (LOADED)
 reverse: False
 
diff --git a/ptp/components/models/sentence_embeddings.py b/ptp/components/models/sentence_embeddings.py
index 6004e2a..2426d0c 100644
--- a/ptp/components/models/sentence_embeddings.py
+++ b/ptp/components/models/sentence_embeddings.py
@@ -25,6 +25,7 @@ from ptp.data_types.data_definition import DataDefinition
 
 import ptp.components.utils.embeddings as emb
+from ptp.components.utils.word_mappings import pad_list
 
 
 class SentenceEmbeddings(Model, WordMappings):
@@ -56,6 +57,9 @@ def __init__(self, name, config):
         self.key_inputs = self.stream_keys["inputs"]
         self.key_outputs = self.stream_keys["outputs"]
 
+        # Force padding to a fixed length
+        self.fixed_padding = self.config['fixed_padding']
+
         # Retrieve embeddings size from configuration and export it to globals.
         self.embeddings_size = self.config['embeddings_size']
         self.globals["embeddings_size"] = self.embeddings_size
@@ -120,6 +124,11 @@ def forward(self, data_dict):
                 # Add index to outputs.
                 output_sample.append( output_index )
 
+            # Apply fixed padding to all sequences if requested
+            # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
+            if self.fixed_padding > 0:
+                pad_list(output_sample, self.fixed_padding)
+
             #indices_list.append(self.app_state.FloatTensor(output_sample))
             indices_list.append(self.app_state.LongTensor(output_sample))
 
diff --git a/ptp/components/text/sentence_indexer.py b/ptp/components/text/sentence_indexer.py
index 7cb0ece..4c70d05 100644
--- a/ptp/components/text/sentence_indexer.py
+++ b/ptp/components/text/sentence_indexer.py
@@ -19,6 +19,7 @@ from ptp.components.component import Component
 from ptp.components.mixins.word_mappings import WordMappings
 from ptp.data_types.data_definition import DataDefinition
+from ptp.components.utils.word_mappings import pad_list
 
 
 class SentenceIndexer(Component, WordMappings):
@@ -50,6 +51,9 @@ def __init__(self, name, config):
         # Read mode from the configuration.
         self.mode_reverse = self.config['reverse']
 
+        # Force padding to a fixed length
+        self.fixed_padding = self.config['fixed_padding']
+
         if self.mode_reverse:
             # We will need reverse (index:word) mapping.
             self.ix_to_word = dict((v,k) for k,v in self.word_to_ix.items())
@@ -140,10 +144,16 @@ def sentences_to_tensor(self, data_dict):
                 # Add index to outputs.
                 output_sample.append( output_index )
 
-            outputs_list.append(output_sample)
+            # Apply fixed padding to all sequences if requested
+            # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
+            if self.fixed_padding > 0:
+                pad_list(output_sample, self.fixed_padding)
+
+            outputs_list.append(self.app_state.LongTensor(output_sample))
 
         # Transform the list of lists to tensor.
-        output = self.app_state.LongTensor(outputs_list)
+        # output = self.app_state.LongTensor(outputs_list)
+        output = torch.nn.utils.rnn.pad_sequence(outputs_list, batch_first=True)
 
         # Create the returned dict.
         data_dict.extend({self.key_outputs: output})
@@ -172,6 +182,12 @@ def tensor_indices_to_sentences(self, data_dict):
                 output_word = self.ix_to_word[token]
                 # Add index to outputs.
                 output_sample.append( output_word )
+
+            # Apply fixed padding to all sequences if requested
+            # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
+            if self.fixed_padding > 0:
+                pad_list(output_sample, self.fixed_padding)
+
             # Add sentence to batch.
             outputs_list.append(output_sample)
 
@@ -204,8 +220,21 @@ def tensor_distributions_to_sentences(self, data_dict):
                 output_word = self.ix_to_word[token]
                 # Add index to outputs.
                 output_sample.append( output_word )
+
+            # Apply fixed padding to all sequences if requested
+            # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
+            if self.fixed_padding > 0:
+                pad_list(output_sample, self.fixed_padding)
+
             # Add sentence to batch.
             outputs_list.append(output_sample)
 
         # Create the returned dict.
         data_dict.extend({self.key_outputs: outputs_list})
+
+    @staticmethod
+    def pad_list(self, l: list, length: int, value = 0):
+        if len(l) < length:
+            l.extend([value]*(length-len(l)))
+        elif len(l) > length:
+            del l[length:]
diff --git a/ptp/components/utils/word_mappings.py b/ptp/components/utils/word_mappings.py
index d43abf6..ee1ce20 100644
--- a/ptp/components/utils/word_mappings.py
+++ b/ptp/components/utils/word_mappings.py
@@ -135,3 +135,21 @@ def save_word_mappings_to_csv_file(logger, folder, filename, word_to_ix, fieldna
             writer.writerow({fieldnames[0]:k, fieldnames[1]: v})
 
     logger.info("Saved mappings of size {} to file '{}'".format(len(word_to_ix), file_path))
+
+def pad_list(l: list, length: int, value = 0):
+    """
+    Will apply padding / clipping to list to meet requested length.
+    Works on the list in-place.
+
+    :param l: List to manipulate
+
+    :param length: Target length
+
+    :param value: Value to fill when padding. Default is int(0).
+
+    :return: None
+    """
+    if len(l) < length:
+        l.extend([value]*(length-len(l)))
+    elif len(l) > length:
+        del l[length:]

From 633553d73d0777a664e706ab211eef2cb7bf3cf1 Mon Sep 17 00:00:00 2001
From: Alexis Asseman <33075224+aasseman@users.noreply.github.com>
Date: Fri, 26 Apr 2019 09:50:46 -0700
Subject: [PATCH 2/2] Refactoring / cleaning

---
 ptp/components/models/sentence_embeddings.py |  4 ++--
 ptp/components/text/sentence_indexer.py      | 15 ++++-----------
 ptp/components/utils/word_mappings.py        |  2 +-
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/ptp/components/models/sentence_embeddings.py b/ptp/components/models/sentence_embeddings.py
index 2426d0c..78c3065 100644
--- a/ptp/components/models/sentence_embeddings.py
+++ b/ptp/components/models/sentence_embeddings.py
@@ -25,7 +25,7 @@ from ptp.data_types.data_definition import DataDefinition
 
 import ptp.components.utils.embeddings as emb
-from ptp.components.utils.word_mappings import pad_list
+from ptp.components.utils.word_mappings import pad_trunc_list
 
 
 class SentenceEmbeddings(Model, WordMappings):
@@ -127,7 +127,7 @@ def forward(self, data_dict):
             # Apply fixed padding to all sequences if requested
             # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
             if self.fixed_padding > 0:
-                pad_list(output_sample, self.fixed_padding)
+                pad_trunc_list(output_sample, self.fixed_padding)
 
             #indices_list.append(self.app_state.FloatTensor(output_sample))
             indices_list.append(self.app_state.LongTensor(output_sample))
diff --git a/ptp/components/text/sentence_indexer.py b/ptp/components/text/sentence_indexer.py
index 4c70d05..4450e83 100644
--- a/ptp/components/text/sentence_indexer.py
+++ b/ptp/components/text/sentence_indexer.py
@@ -19,7 +19,7 @@ from ptp.components.component import Component
 from ptp.components.mixins.word_mappings import WordMappings
 from ptp.data_types.data_definition import DataDefinition
-from ptp.components.utils.word_mappings import pad_list
+from ptp.components.utils.word_mappings import pad_trunc_list
 
 
 class SentenceIndexer(Component, WordMappings):
@@ -147,7 +147,7 @@ def sentences_to_tensor(self, data_dict):
             # Apply fixed padding to all sequences if requested
             # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
             if self.fixed_padding > 0:
-                pad_list(output_sample, self.fixed_padding)
+                pad_trunc_list(output_sample, self.fixed_padding)
 
             outputs_list.append(self.app_state.LongTensor(output_sample))
 
@@ -186,7 +186,7 @@ def tensor_indices_to_sentences(self, data_dict):
             # Apply fixed padding to all sequences if requested
             # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
             if self.fixed_padding > 0:
-                pad_list(output_sample, self.fixed_padding)
+                pad_trunc_list(output_sample, self.fixed_padding)
 
             # Add sentence to batch.
             outputs_list.append(output_sample)
 
@@ -224,17 +224,10 @@ def tensor_distributions_to_sentences(self, data_dict):
             # Apply fixed padding to all sequences if requested
             # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
             if self.fixed_padding > 0:
-                pad_list(output_sample, self.fixed_padding)
+                pad_trunc_list(output_sample, self.fixed_padding)
 
             # Add sentence to batch.
             outputs_list.append(output_sample)
 
         # Create the returned dict.
         data_dict.extend({self.key_outputs: outputs_list})
-
-    @staticmethod
-    def pad_list(self, l: list, length: int, value = 0):
-        if len(l) < length:
-            l.extend([value]*(length-len(l)))
-        elif len(l) > length:
-            del l[length:]
diff --git a/ptp/components/utils/word_mappings.py b/ptp/components/utils/word_mappings.py
index ee1ce20..5b94350 100644
--- a/ptp/components/utils/word_mappings.py
+++ b/ptp/components/utils/word_mappings.py
@@ -136,7 +136,7 @@ def save_word_mappings_to_csv_file(logger, folder, filename, word_to_ix, fieldna
 
     logger.info("Saved mappings of size {} to file '{}'".format(len(word_to_ix), file_path))
 
-def pad_list(l: list, length: int, value = 0):
+def pad_trunc_list(l: list, length: int, value = 0):
     """
     Will apply padding / clipping to list to meet requested length.
     Works on the list in-place.
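
Illustrative sketch (not part of the patch series above): the snippet below mirrors the two padding modes that the new fixed_padding option selects. The helper logic is copied from pad_trunc_list in ptp/components/utils/word_mappings.py and the pad_sequence call matches the dynamic-padding path in SentenceIndexer; the sample batch, variable names and prints are hypothetical and only for demonstration.

# Sketch only: demonstrates fixed vs. dynamic padding as introduced by these patches.
import torch


def pad_trunc_list(l: list, length: int, value=0):
    # Pad with `value`, or truncate, so that `l` has exactly `length` items (in-place).
    if len(l) < length:
        l.extend([value] * (length - len(l)))
    elif len(l) > length:
        del l[length:]


# Two tokenized sentences already mapped to word indices (hypothetical data).
batch = [[4, 8, 15], [16, 23]]

# fixed_padding > 0: every sample is forced to the same length before tensorization,
# so the sequence dimension is identical across all batches.
fixed_padding = 5
fixed_samples = []
for sample in batch:
    sample = list(sample)
    pad_trunc_list(sample, fixed_padding)
    fixed_samples.append(torch.as_tensor(sample, dtype=torch.long))
fixed = torch.stack(fixed_samples)
print(fixed.shape)  # torch.Size([2, 5])

# fixed_padding == -1: samples keep their lengths and pad_sequence pads each batch
# to its own longest sequence, so the sequence dimension varies from batch to batch.
dynamic = torch.nn.utils.rnn.pad_sequence(
    [torch.as_tensor(s, dtype=torch.long) for s in batch],
    batch_first=True,
)
print(dynamic.shape)  # torch.Size([2, 3])

Note that sequences longer than fixed_padding are clipped, not only padded, which is the behavior the rename from pad_list to pad_trunc_list in the second patch makes explicit.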