From df2b5af86c16acb319f4f4fbd6be5a6477aacbca Mon Sep 17 00:00:00 2001
From: Alexis Asseman <33075224+aasseman@users.noreply.github.com>
Date: Thu, 25 Apr 2019 19:42:48 -0700
Subject: [PATCH 1/2] Add fixed padding option to sentence_embeddings,
 sentence_indexer

---
 .../components/models/sentence_embeddings.yml |  6 ++++
 .../components/text/sentence_indexer.yml      |  6 ++++
 ptp/components/models/sentence_embeddings.py  |  9 +++++
 ptp/components/text/sentence_indexer.py       | 33 +++++++++++++++++--
 ptp/components/utils/word_mappings.py         | 18 ++++++++++
 5 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/configs/default/components/models/sentence_embeddings.yml b/configs/default/components/models/sentence_embeddings.yml
index ab3a6da..7ca8987 100644
--- a/configs/default/components/models/sentence_embeddings.yml
+++ b/configs/default/components/models/sentence_embeddings.yml
@@ -25,6 +25,12 @@ import_word_mappings_from_globals: False
 # Flag informing whether word mappings will be exported to globals (LOADED)
 export_word_mappings_to_globals: False
 
+# Fixed padding length
+# -1 -> For each batch, automatically pad to the length of the longest sequence of the batch
+#       (variable from batch to batch)
+# > 0 -> Pad each sequence to the chosen length (fixed for all batches)
+fixed_padding: -1
+
 # File containing pretrained embeddings (LOADED)
 # Empty means that no embeddings will be loaded.
 pretrained_embeddings_file: ''
diff --git a/configs/default/components/text/sentence_indexer.yml b/configs/default/components/text/sentence_indexer.yml
index 0921bc7..8ec714f 100644
--- a/configs/default/components/text/sentence_indexer.yml
+++ b/configs/default/components/text/sentence_indexer.yml
@@ -25,6 +25,12 @@ import_word_mappings_from_globals: False
 # Flag informing whether word mappings will be exported to globals (LOADED)
 export_word_mappings_to_globals: False
 
+# Fixed padding length
+# -1 -> For each batch, automatically pad to the length of the longest sequence of the batch
+#       (variable from batch to batch)
+# > 0 -> Pad each sequence to the chosen length (fixed for all batches)
+fixed_padding: -1
+
 # Operation mode. If 'reverse' is True, then it will change indices into words (LOADED)
 reverse: False
 
diff --git a/ptp/components/models/sentence_embeddings.py b/ptp/components/models/sentence_embeddings.py
index 6004e2a..2426d0c 100644
--- a/ptp/components/models/sentence_embeddings.py
+++ b/ptp/components/models/sentence_embeddings.py
@@ -25,6 +25,7 @@ from ptp.data_types.data_definition import DataDefinition
 
 import ptp.components.utils.embeddings as emb
+from ptp.components.utils.word_mappings import pad_list
 
 
 class SentenceEmbeddings(Model, WordMappings):
@@ -56,6 +57,9 @@ def __init__(self, name, config):
         self.key_inputs = self.stream_keys["inputs"]
         self.key_outputs = self.stream_keys["outputs"]
 
+        # Force padding to a fixed length
+        self.fixed_padding = self.config['fixed_padding']
+
         # Retrieve embeddings size from configuration and export it to globals.
         self.embeddings_size = self.config['embeddings_size']
         self.globals["embeddings_size"] = self.embeddings_size
@@ -120,6 +124,11 @@ def forward(self, data_dict):
                 # Add index to outputs.
                 output_sample.append( output_index )
 
+            # Apply fixed padding to all sequences if requested
+            # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
+            if self.fixed_padding > 0:
+                pad_list(output_sample, self.fixed_padding)
+
             #indices_list.append(self.app_state.FloatTensor(output_sample))
             indices_list.append(self.app_state.LongTensor(output_sample))
 
diff --git a/ptp/components/text/sentence_indexer.py b/ptp/components/text/sentence_indexer.py
index 7cb0ece..4c70d05 100644
--- a/ptp/components/text/sentence_indexer.py
+++ b/ptp/components/text/sentence_indexer.py
@@ -19,6 +19,7 @@ from ptp.components.component import Component
 from ptp.components.mixins.word_mappings import WordMappings
 from ptp.data_types.data_definition import DataDefinition
+from ptp.components.utils.word_mappings import pad_list
 
 
 class SentenceIndexer(Component, WordMappings):
@@ -50,6 +51,9 @@ def __init__(self, name, config):
         # Read mode from the configuration.
         self.mode_reverse = self.config['reverse']
 
+        # Force padding to a fixed length
+        self.fixed_padding = self.config['fixed_padding']
+
         if self.mode_reverse:
             # We will need reverse (index:word) mapping.
             self.ix_to_word = dict((v,k) for k,v in self.word_to_ix.items())
@@ -140,10 +144,16 @@ def sentences_to_tensor(self, data_dict):
                 # Add index to outputs.
                 output_sample.append( output_index )
 
-            outputs_list.append(output_sample)
+            # Apply fixed padding to all sequences if requested
+            # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
+            if self.fixed_padding > 0:
+                pad_list(output_sample, self.fixed_padding)
+
+            outputs_list.append(self.app_state.LongTensor(output_sample))
 
         # Transform the list of lists to tensor.
-        output = self.app_state.LongTensor(outputs_list)
+        # output = self.app_state.LongTensor(outputs_list)
+        output = torch.nn.utils.rnn.pad_sequence(outputs_list, batch_first=True)
 
         # Create the returned dict.
         data_dict.extend({self.key_outputs: output})
@@ -172,6 +182,12 @@ def tensor_indices_to_sentences(self, data_dict):
                 output_word = self.ix_to_word[token]
                 # Add index to outputs.
                 output_sample.append( output_word )
+
+            # Apply fixed padding to all sequences if requested
+            # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
+            if self.fixed_padding > 0:
+                pad_list(output_sample, self.fixed_padding)
+
             # Add sentence to batch.
             outputs_list.append(output_sample)
 
@@ -204,8 +220,21 @@ def tensor_distributions_to_sentences(self, data_dict):
                 output_word = self.ix_to_word[token]
                 # Add index to outputs.
                 output_sample.append( output_word )
+
+            # Apply fixed padding to all sequences if requested
+            # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
+            if self.fixed_padding > 0:
+                pad_list(output_sample, self.fixed_padding)
+
             # Add sentence to batch.
             outputs_list.append(output_sample)
 
         # Create the returned dict.
         data_dict.extend({self.key_outputs: outputs_list})
+
+    @staticmethod
+    def pad_list(self, l: list, length: int, value = 0):
+        if len(l) < length:
+            l.extend([value]*(length-len(l)))
+        elif len(l) > length:
+            del l[length:]
diff --git a/ptp/components/utils/word_mappings.py b/ptp/components/utils/word_mappings.py
index d43abf6..ee1ce20 100644
--- a/ptp/components/utils/word_mappings.py
+++ b/ptp/components/utils/word_mappings.py
@@ -135,3 +135,21 @@ def save_word_mappings_to_csv_file(logger, folder, filename, word_to_ix, fieldna
             writer.writerow({fieldnames[0]:k, fieldnames[1]: v})
 
     logger.info("Saved mappings of size {} to file '{}'".format(len(word_to_ix), file_path))
+
+def pad_list(l: list, length: int, value = 0):
+    """
+    Will apply padding / clipping to list to meet requested length.
+    Works on the list in-place.
+
+    :param l: List to manipulate
+
+    :param length: Target length
+
+    :param value: Value to fill when padding. Default is int(0).
+
+    :return: None
+    """
+    if len(l) < length:
+        l.extend([value]*(length-len(l)))
+    elif len(l) > length:
+        del l[length:]

From 633553d73d0777a664e706ab211eef2cb7bf3cf1 Mon Sep 17 00:00:00 2001
From: Alexis Asseman <33075224+aasseman@users.noreply.github.com>
Date: Fri, 26 Apr 2019 09:50:46 -0700
Subject: [PATCH 2/2] Refactoring / cleaning

---
 ptp/components/models/sentence_embeddings.py |  4 ++--
 ptp/components/text/sentence_indexer.py      | 15 ++++-----------
 ptp/components/utils/word_mappings.py        |  2 +-
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/ptp/components/models/sentence_embeddings.py b/ptp/components/models/sentence_embeddings.py
index 2426d0c..78c3065 100644
--- a/ptp/components/models/sentence_embeddings.py
+++ b/ptp/components/models/sentence_embeddings.py
@@ -25,7 +25,7 @@ from ptp.data_types.data_definition import DataDefinition
 
 import ptp.components.utils.embeddings as emb
-from ptp.components.utils.word_mappings import pad_list
+from ptp.components.utils.word_mappings import pad_trunc_list
 
 
 class SentenceEmbeddings(Model, WordMappings):
@@ -127,7 +127,7 @@ def forward(self, data_dict):
             # Apply fixed padding to all sequences if requested
             # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
             if self.fixed_padding > 0:
-                pad_list(output_sample, self.fixed_padding)
+                pad_trunc_list(output_sample, self.fixed_padding)
 
             #indices_list.append(self.app_state.FloatTensor(output_sample))
             indices_list.append(self.app_state.LongTensor(output_sample))
diff --git a/ptp/components/text/sentence_indexer.py b/ptp/components/text/sentence_indexer.py
index 4c70d05..4450e83 100644
--- a/ptp/components/text/sentence_indexer.py
+++ b/ptp/components/text/sentence_indexer.py
@@ -19,7 +19,7 @@ from ptp.components.component import Component
 from ptp.components.mixins.word_mappings import WordMappings
 from ptp.data_types.data_definition import DataDefinition
-from ptp.components.utils.word_mappings import pad_list
+from ptp.components.utils.word_mappings import pad_trunc_list
 
 
 class SentenceIndexer(Component, WordMappings):
@@ -147,7 +147,7 @@ def sentences_to_tensor(self, data_dict):
             # Apply fixed padding to all sequences if requested
             # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
             if self.fixed_padding > 0:
-                pad_list(output_sample, self.fixed_padding)
+                pad_trunc_list(output_sample, self.fixed_padding)
 
             outputs_list.append(self.app_state.LongTensor(output_sample))
 
@@ -186,7 +186,7 @@ def tensor_indices_to_sentences(self, data_dict):
             # Apply fixed padding to all sequences if requested
             # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
             if self.fixed_padding > 0:
-                pad_list(output_sample, self.fixed_padding)
+                pad_trunc_list(output_sample, self.fixed_padding)
 
             # Add sentence to batch.
             outputs_list.append(output_sample)
 
@@ -224,17 +224,10 @@ def tensor_distributions_to_sentences(self, data_dict):
             # Apply fixed padding to all sequences if requested
             # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding
             if self.fixed_padding > 0:
-                pad_list(output_sample, self.fixed_padding)
+                pad_trunc_list(output_sample, self.fixed_padding)
 
             # Add sentence to batch.
             outputs_list.append(output_sample)
 
         # Create the returned dict.
         data_dict.extend({self.key_outputs: outputs_list})
-
-    @staticmethod
-    def pad_list(self, l: list, length: int, value = 0):
-        if len(l) < length:
-            l.extend([value]*(length-len(l)))
-        elif len(l) > length:
-            del l[length:]
diff --git a/ptp/components/utils/word_mappings.py b/ptp/components/utils/word_mappings.py
index ee1ce20..5b94350 100644
--- a/ptp/components/utils/word_mappings.py
+++ b/ptp/components/utils/word_mappings.py
@@ -136,7 +136,7 @@ def save_word_mappings_to_csv_file(logger, folder, filename, word_to_ix, fieldna
 
     logger.info("Saved mappings of size {} to file '{}'".format(len(word_to_ix), file_path))
 
-def pad_list(l: list, length: int, value = 0):
+def pad_trunc_list(l: list, length: int, value = 0):
     """
     Will apply padding / clipping to list to meet requested length.
     Works on the list in-place.
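
Illustrative sketch (not part of the patch series above): the snippet below mirrors the two padding modes that the new fixed_padding option selects. The helper logic is copied from pad_trunc_list in ptp/components/utils/word_mappings.py and the pad_sequence call matches the dynamic-padding path in SentenceIndexer; the sample batch, variable names and prints are hypothetical and only for demonstration.

# Sketch only: demonstrates fixed vs. dynamic padding as introduced by these patches.
import torch


def pad_trunc_list(l: list, length: int, value=0):
    # Pad with `value`, or truncate, so that `l` has exactly `length` items (in-place).
    if len(l) < length:
        l.extend([value] * (length - len(l)))
    elif len(l) > length:
        del l[length:]


# Two tokenized sentences already mapped to word indices (hypothetical data).
batch = [[4, 8, 15], [16, 23]]

# fixed_padding > 0: every sample is forced to the same length before tensorization,
# so the sequence dimension is identical across all batches.
fixed_padding = 5
fixed_samples = []
for sample in batch:
    sample = list(sample)
    pad_trunc_list(sample, fixed_padding)
    fixed_samples.append(torch.as_tensor(sample, dtype=torch.long))
fixed = torch.stack(fixed_samples)
print(fixed.shape)  # torch.Size([2, 5])

# fixed_padding == -1: samples keep their lengths and pad_sequence pads each batch
# to its own longest sequence, so the sequence dimension varies from batch to batch.
dynamic = torch.nn.utils.rnn.pad_sequence(
    [torch.as_tensor(s, dtype=torch.long) for s in batch],
    batch_first=True,
)
print(dynamic.shape)  # torch.Size([2, 3])

Note that sequences longer than fixed_padding are clipped, not only padded, which is the behavior the rename from pad_list to pad_trunc_list in the second patch makes explicit.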