From 4e7d6c9c8286b1cbc880112f22282406df341996 Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Mon, 29 Apr 2019 11:11:49 -0700 Subject: [PATCH 1/5] Add support for in sentence indexer, add ignore index global to nllloss --- configs/default/components/losses/nll_loss.yml | 3 +++ .../components/models/sentence_embeddings.yml | 10 ++++++++-- .../default/components/text/sentence_indexer.yml | 15 +++++++++++++-- ptp/components/losses/nll_loss.py | 9 ++++++++- ptp/components/mixins/word_mappings.py | 3 +++ ptp/components/text/sentence_indexer.py | 6 +++++- ptp/components/utils/word_mappings.py | 8 +++++++- 7 files changed, 47 insertions(+), 7 deletions(-) diff --git a/configs/default/components/losses/nll_loss.yml b/configs/default/components/losses/nll_loss.yml index b49aad0..0f21567 100644 --- a/configs/default/components/losses/nll_loss.yml +++ b/configs/default/components/losses/nll_loss.yml @@ -37,6 +37,9 @@ globals: # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. #################################################################### + # Target value to ignore (masking) + ignore_index: ignore_index + #################################################################### # 4. Keymappings associated with GLOBAL variables that will be SET. #################################################################### diff --git a/configs/default/components/models/sentence_embeddings.yml b/configs/default/components/models/sentence_embeddings.yml index 1bffaf4..16ba8f4 100644 --- a/configs/default/components/models/sentence_embeddings.yml +++ b/configs/default/components/models/sentence_embeddings.yml @@ -11,8 +11,14 @@ data_folder: '~/data/' source_vocabulary_files: '' # Additional tokens that will be added to vocabulary (LOADED) -# This list can be extended, but is a special token used ALWAYS for padding shorter sequences. -additional_tokens: '' +# This list can be extended, but and are special tokens. 
+# is ALWAYS used for padding shorter sequences. +additional_tokens: ',' + +# Enable (end of sequence) token. +eos_token: False + +export_pad_mapping_to_globals: False # File containing word (LOADED) word_mappings_file: 'word_mappings.csv' diff --git a/configs/default/components/text/sentence_indexer.yml b/configs/default/components/text/sentence_indexer.yml index 34283a6..219dc4f 100644 --- a/configs/default/components/text/sentence_indexer.yml +++ b/configs/default/components/text/sentence_indexer.yml @@ -11,8 +11,14 @@ data_folder: '~/data/' source_vocabulary_files: '' # Additional tokens that will be added to vocabulary (LOADED) -# This list can be extended, but is a special token used ALWAYS for padding shorter sequences. -additional_tokens: '' +# This list can be extended, but and are special tokens. +# is ALWAYS used for padding shorter sequences. +additional_tokens: ',' + +# Enable (end of sequence) token. +eos_token: False + +export_pad_mapping_to_globals: False # File containing word (LOADED) word_mappings_file: 'word_mappings.csv' @@ -68,6 +74,11 @@ globals: # This depends on the import/export configuration flags above. vocabulary_size: vocabulary_size + + # Index of the token + # Will be set only if `export_pad_mapping_to_globals == True` + pad_mapping: pad_mapping + #################################################################### # 5. Keymappings associated with statistics that will be ADDED. #################################################################### diff --git a/ptp/components/losses/nll_loss.py b/ptp/components/losses/nll_loss.py index a6c18b6..ee75212 100644 --- a/ptp/components/losses/nll_loss.py +++ b/ptp/components/losses/nll_loss.py @@ -45,8 +45,15 @@ def __init__(self, name, config): # Get number of targets dimensions. self.num_targets_dims = self.config["num_targets_dims"] + # Get the optional ignore_index. 
-100 is the default value in PyTorch + self.ignore_index = -100 + try: + self.ignore_index = self.globals["ignore_index"] + except KeyError: + pass + # Set loss. - self.loss_function = nn.NLLLoss() + self.loss_function = nn.NLLLoss(ignore_index=self.ignore_index) def input_data_definitions(self): diff --git a/ptp/components/mixins/word_mappings.py b/ptp/components/mixins/word_mappings.py index 1920574..9ffa4ca 100644 --- a/ptp/components/mixins/word_mappings.py +++ b/ptp/components/mixins/word_mappings.py @@ -72,6 +72,9 @@ def __init__(self): #, name, class_type, config): if word != '' and word not in self.word_to_ix: self.word_to_ix[word] = len(self.word_to_ix) + if self.config["export_pad_mapping_to_globals"]: + self.globals["pad_mapping"] = self.word_to_ix['<PAD>'] + self.logger.info("Initialized word mappings with vocabulary of size {}".format(len(self.word_to_ix))) # Check if we want to export word mappings to globals. diff --git a/ptp/components/text/sentence_indexer.py b/ptp/components/text/sentence_indexer.py index 3db675b..2132b63 100644 --- a/ptp/components/text/sentence_indexer.py +++ b/ptp/components/text/sentence_indexer.py @@ -54,6 +54,9 @@ def __init__(self, name, config): # Force padding to a fixed length self.fixed_padding = self.config['fixed_padding'] + # Whether to add <EOS> at the end of sequence + self.enable_eos_token = self.config['eos_token'] + if self.mode_reverse: # We will need reverse (index:word) mapping. self.ix_to_word = dict((v,k) for k,v in self.word_to_ix.items()) @@ -133,6 +136,7 @@ def sentences_to_tensor(self, data_dict): # Get index of padding. pad_index = self.word_to_ix['<PAD>'] + eos_index = self.word_to_ix['<EOS>'] if self.enable_eos_token else None outputs_list = [] # Process sentences 1 by 1. 
@@ -150,7 +154,7 @@ def sentences_to_tensor(self, data_dict): # Apply fixed padding to all sequences if requested # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding if self.fixed_padding > 0: - pad_trunc_list(output_sample, self.fixed_padding, padding_value=pad_index) + pad_trunc_list(output_sample, self.fixed_padding, padding_value=pad_index, eos_value=eos_index) outputs_list.append(self.app_state.LongTensor(output_sample)) diff --git a/ptp/components/utils/word_mappings.py b/ptp/components/utils/word_mappings.py index 32b37e4..61f5e52 100644 --- a/ptp/components/utils/word_mappings.py +++ b/ptp/components/utils/word_mappings.py @@ -133,7 +133,7 @@ def save_word_mappings_to_csv_file(logger, folder, filename, word_to_ix, fieldna logger.info("Saved mappings of size {} to file '{}'".format(len(word_to_ix), file_path)) -def pad_trunc_list(l: list, length: int, padding_value = 0): +def pad_trunc_list(l: list, length: int, padding_value = 0, eos_value = None): """ Will apply padding / clipping to list to meet requested length. Works on the list in-place. 
@@ -146,7 +146,13 @@ def pad_trunc_list(l: list, length: int, padding_value = 0): :return: None """ + if len(l) < length: + if eos_value is not None: + l.append(eos_value) l.extend([padding_value]*(length-len(l))) + elif len(l) > length: del l[length:] + if eos_value is not None: + l[length-1] = eos_value \ No newline at end of file From d93824e21e49ac7347e9668c02aaaad2f60e1cc5 Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Mon, 29 Apr 2019 17:45:19 -0700 Subject: [PATCH 2/5] HACK: add unneeded for the WordMappings class to be happy --- configs/default/components/text/label_indexer.yml | 3 +++ configs/default/components/text/sentence_indexer.yml | 1 + configs/default/components/text/sentence_one_hot_encoder.yml | 3 +++ configs/default/components/text/word_decoder.yml | 3 +++ 4 files changed, 10 insertions(+) diff --git a/configs/default/components/text/label_indexer.yml b/configs/default/components/text/label_indexer.yml index bfe9aa0..d402f43 100644 --- a/configs/default/components/text/label_indexer.yml +++ b/configs/default/components/text/label_indexer.yml @@ -16,6 +16,9 @@ additional_tokens: '' # File containing word (LOADED) word_mappings_file: 'word_mappings.csv' +# HACK: This key is useless here, but needed by parent class. Should be removed/fixed in the future +export_pad_index_to_globals: False + # If set, component will always (re)generate the vocabulary (LOADED) regenerate: False diff --git a/configs/default/components/text/sentence_indexer.yml b/configs/default/components/text/sentence_indexer.yml index 0144fca..c3684e3 100644 --- a/configs/default/components/text/sentence_indexer.yml +++ b/configs/default/components/text/sentence_indexer.yml @@ -18,6 +18,7 @@ additional_tokens: ',' # Enable (end of sequence) token. eos_token: False +# HACK: This key is useless here, but needed by parent class. 
Should be removed/fixed in the future export_pad_index_to_globals: False # File containing word (LOADED) diff --git a/configs/default/components/text/sentence_one_hot_encoder.yml b/configs/default/components/text/sentence_one_hot_encoder.yml index 3eadd46..17f2be5 100644 --- a/configs/default/components/text/sentence_one_hot_encoder.yml +++ b/configs/default/components/text/sentence_one_hot_encoder.yml @@ -16,6 +16,9 @@ additional_tokens: '' # File containing word (LOADED) word_mappings_file: 'word_mappings.csv' +# HACK: This key is useless here, but needed by parent class. Should be removed/fixed in the future +export_pad_index_to_globals: False + # If set, component will always (re)generate the vocabulary (LOADED) regenerate: False diff --git a/configs/default/components/text/word_decoder.yml b/configs/default/components/text/word_decoder.yml index 738904b..bb6e996 100644 --- a/configs/default/components/text/word_decoder.yml +++ b/configs/default/components/text/word_decoder.yml @@ -13,6 +13,9 @@ source_vocabulary_files: '' # Additional tokens that will be added to vocabulary (LOADED) additional_tokens: '' +# HACK: This key is useless here, but needed by parent class. 
Should be removed/fixed in the future +export_pad_index_to_globals: False + # File containing word (LOADED) word_mappings_file: 'word_mappings.csv' From 454e0af532226a7803fb27417664f2802bdeef46 Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Mon, 29 Apr 2019 17:59:55 -0700 Subject: [PATCH 3/5] Added configs for C4 answer generation, with and without image, using attention decoder --- .../c4_classification/c4_enc_attndec.yml | 8 +- .../c4_enc_attndec_resnet152_ewm_cat_is.yml | 234 ++++++++++++++++++ 2 files changed, 239 insertions(+), 3 deletions(-) create mode 100644 configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml diff --git a/configs/vqa_med_2019/c4_classification/c4_enc_attndec.yml b/configs/vqa_med_2019/c4_classification/c4_enc_attndec.yml index edea53b..d9347a0 100644 --- a/configs/vqa_med_2019/c4_classification/c4_enc_attndec.yml +++ b/configs/vqa_med_2019/c4_classification/c4_enc_attndec.yml @@ -6,13 +6,13 @@ training: problem: batch_size: 64 categories: C4 - question_preprocessing: lowercase, remove_punctuation, tokenize + question_preprocessing: lowercase, remove_punctuation, tokenize, random_remove_stop_words #,random_shuffle_words answer_preprocessing: lowercase, remove_punctuation, tokenize export_sample_weights: ~/data/vqa-med/answers.c4.weights.csv sampler: weights: ~/data/vqa-med/answers.c4.weights.csv dataloader: - num_workers: 8 + num_workers: 2 # Termination. 
terminal_conditions: loss_stop: 1.0e-2 @@ -27,7 +27,7 @@ validation: question_preprocessing: lowercase, remove_punctuation, tokenize answer_preprocessing: lowercase, remove_punctuation, tokenize dataloader: - num_workers: 8 + num_workers: 2 pipeline: name: c4_dec_attndecoder @@ -54,6 +54,8 @@ pipeline: word_mappings_file: answer_words.c4.preprocessed.word.mappings.csv import_word_mappings_from_globals: False export_word_mappings_to_globals: True + export_pad_mapping_to_globals: True + eos_token: True fixed_padding: 10 streams: inputs: answers diff --git a/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml b/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml new file mode 100644 index 0000000..7302e9d --- /dev/null +++ b/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml @@ -0,0 +1,234 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +# Training parameters: +training: + problem: + batch_size: 64 + categories: C4 + question_preprocessing: lowercase, remove_punctuation, tokenize #, random_remove_stop_words #,random_shuffle_words + answer_preprocessing: lowercase, remove_punctuation, tokenize + export_sample_weights: ~/data/vqa-med/answers.c4.weights.csv + batch_size: 32 + sampler: + weights: ~/data/vqa-med/answers.c4.weights.csv + dataloader: + num_workers: 2 + # Termination. + terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 1000000 + epoch_limit: -1 + +# Validation parameters: +validation: + problem: + batch_size: 64 + categories: C4 + question_preprocessing: lowercase, remove_punctuation, tokenize + answer_preprocessing: lowercase, remove_punctuation, tokenize + batch_size: 32 + dataloader: + num_workers: 2 + +pipeline: + name: c4_enc_attndec_resnet152_ewm_cat_is + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. 
+ keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size,image_size_encoder_input_size, image_size_encoder_output_size] + values: [100, 100, 100, 2, 10] + + # Question embeddings + question_embeddings: + priority: 1.0 + type: SentenceEmbeddings + embeddings_size: 100 + pretrained_embeddings_file: glove.6B.100d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + fixed_padding: 10 + additional_tokens: + streams: + inputs: questions + outputs: embedded_questions + + # Target encoding. + target_indexer: + type: SentenceIndexer + priority: 1.1 + data_folder: ~/data/vqa-med + word_mappings_file: answer_words.c4.preprocessed.word.mappings.csv + import_word_mappings_from_globals: False + export_word_mappings_to_globals: True + export_pad_mapping_to_globals: True + eos_token: True + fixed_padding: 10 + streams: + inputs: answers + outputs: indexed_answers + globals: + vocabulary_size: ans_vocabulary_size + word_mappings: ans_word_mappings + pad_index: ans_pad_index + + # Image encoder. 
+ image_encoder: + priority: 2.0 + type: TorchVisionWrapper + model_type: resnet152 + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + # Single layer GRU Encoder + encoder: + type: RecurrentNeuralNetwork + cell_type: GRU + priority: 3 + initial_state: Trainable + hidden_size: 100 + num_layers: 1 + use_logsoftmax: False + output_last_state: True + prediction_mode: Dense + ffn_output: False + dropout_rate: 0.1 + streams: + inputs: embedded_questions + predictions: s2s_encoder_output + output_state: s2s_state_output + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + reshaper_1: + priority: 3.01 + type: ReshapeTensor + input_dims: [1, -1, 100] + output_dims: [-1, 100] + streams: + inputs: s2s_state_output + outputs: s2s_state_output_reshaped + globals: + output_size: s2s_state_output_reshaped_size + + # Element wise multiplication + FF. + question_image_fusion: + priority: 3.1 + type: ElementWiseMultiplication + dropout_rate: 0.5 + streams: + image_encodings: image_activations + question_encodings: s2s_state_output_reshaped + outputs: element_wise_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: element_wise_activation_size + + question_image_ffn: + priority: 3.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + streams: + inputs: element_wise_activations + predictions: question_image_activations + globals: + input_size: element_wise_activation_size + prediction_size: element_wise_activation_size + + reshaper_2: + priority: 3.3 + type: ReshapeTensor + input_dims: [-1, 100] + output_dims: [1, -1, 100] + streams: + inputs: question_image_activations + outputs: question_image_activations_reshaped + globals: + output_size: question_image_activations_reshaped_size + + # Single layer GRU Decoder with attention + decoder: + type: Attn_Decoder_RNN + priority: 4 + 
hidden_size: 100 + use_logsoftmax: False + autoregression_length: 10 + prediction_mode: Dense + dropout_rate: 0.1 + streams: + inputs: s2s_encoder_output + predictions: s2s_decoder_output + input_state: question_image_activations_reshaped + globals: + input_size: element_wise_activation_size + prediction_size: element_wise_activation_size + + # FF, to resize from the output size of the seq2seq to the size of the target vector + ff_resize_s2s_output: + type: FeedForwardNetwork + use_logsoftmax: True + dimensions: 3 + priority: 5 + dropout_rate: 0.1 + streams: + inputs: s2s_decoder_output + globals: + input_size: element_wise_activation_size + prediction_size: ans_vocabulary_size + +# Loss + nllloss: + type: NLLLoss + priority: 6 + num_targets_dims: 2 + streams: + targets: indexed_answers + loss: loss + globals: + ignore_index: ans_pad_index + + # Prediction decoding. + prediction_decoder: + type: SentenceIndexer + priority: 10 + # Reverse mode. + reverse: True + # Use distributions as inputs. + use_input_distributions: True + data_folder: ~/data/vqa-med + import_word_mappings_from_globals: True + globals: + word_mappings: ans_word_mappings + streams: + inputs: predictions + outputs: prediction_sentences + + # Statistics. + batch_size: + type: BatchSizeStatistics + priority: 100.0 + + bleu: + type: BLEUStatistics + priority: 100.2 + globals: + word_mappings: ans_word_mappings + streams: + targets: indexed_answers + + + # Viewers. 
+ viewer: + type: StreamViewer + priority: 100.3 + input_streams: questions,answers,indexed_answers,prediction_sentences + +#: pipeline From a018b09377c11d84c1ced0e0e3e559c286839f34 Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Tue, 30 Apr 2019 09:27:31 -0700 Subject: [PATCH 4/5] Add option to ignore words in BLEU --- .../default/components/publishers/bleu_statistics.yml | 3 +++ ptp/components/publishers/bleu_statistics.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/configs/default/components/publishers/bleu_statistics.yml b/configs/default/components/publishers/bleu_statistics.yml index a79a245..c51f387 100644 --- a/configs/default/components/publishers/bleu_statistics.yml +++ b/configs/default/components/publishers/bleu_statistics.yml @@ -13,6 +13,9 @@ use_prediction_distributions: True # TODO! #use_masking: False +# Ignored words - useful for ignoring special tokens +ignored_words: ["", ""] + # Weights of n-grams used when calculating the score. weights: [0.25, 0.25, 0.25, 0.25] diff --git a/ptp/components/publishers/bleu_statistics.py b/ptp/components/publishers/bleu_statistics.py index b303ea9..6432c06 100644 --- a/ptp/components/publishers/bleu_statistics.py +++ b/ptp/components/publishers/bleu_statistics.py @@ -58,6 +58,9 @@ def __init__(self, name, config): # Get masking flag. #self.use_masking = self.config["use_masking"] + # Get ignored words + self.ignored_words = self.config["ignored_words"] + # Retrieve word mappings from globals. word_to_ix = self.globals["word_mappings"] # Construct reverse mapping for faster processing. @@ -144,12 +147,16 @@ def calculate_BLEU(self, data_dict): target_words = [] for t_ind in target_indices: if t_ind in self.ix_to_word.keys(): - target_words.append(self.ix_to_word[t_ind]) + w = self.ix_to_word[t_ind] + if w not in self.ignored_words: + target_words.append(w) # Change prediction indices to words. 
pred_words = [] for p_ind in pred_indices: if p_ind in self.ix_to_word.keys(): - pred_words.append(self.ix_to_word[p_ind]) + w = self.ix_to_word[p_ind] + if w not in self.ignored_words: + pred_words.append(w) # Calculate BLEU. scores.append(sentence_bleu([target_words], pred_words, self.weights)) #print("TARGET: {}\n".format(target_words)) From d93ad168f866dbe823b32b8b31d458463f0e6f7f Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Tue, 30 Apr 2019 14:55:08 -0700 Subject: [PATCH 5/5] Adjustments --- configs/translation/eng_fra_translation_enc_attndec.yml | 9 ++++++++- .../c4_enc_attndec_resnet152_ewm_cat_is.yml | 7 ++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/configs/translation/eng_fra_translation_enc_attndec.yml b/configs/translation/eng_fra_translation_enc_attndec.yml index 7eab08f..2127b46 100644 --- a/configs/translation/eng_fra_translation_enc_attndec.yml +++ b/configs/translation/eng_fra_translation_enc_attndec.yml @@ -57,7 +57,7 @@ pipeline: source_vocabulary_files: eng-fra/eng.train.txt,eng-fra/eng.valid.txt,eng-fra/eng.test.txt vocabulary_mappings_file: eng-fra/eng.all.tokenized_words regenerate: True - additional_tokens: + additional_tokens: , import_word_mappings_from_globals: False export_word_mappings_to_globals: False fixed_padding: 10 @@ -73,11 +73,15 @@ pipeline: source_vocabulary_files: eng-fra/fra.train.txt,eng-fra/fra.valid.txt,eng-fra/fra.test.txt import_word_mappings_from_globals: False export_word_mappings_to_globals: True + export_pad_mapping_to_globals: True + eos_token: True fixed_padding: 10 + additional_tokens: , regenerate: True streams: inputs: targets outputs: indexed_targets + pad_index: tgt_pad_index # Single layer GRU Encoder encoder: @@ -135,6 +139,8 @@ pipeline: streams: targets: indexed_targets loss: loss + globals: + ignore_index: tgt_pad_index # Prediction decoding. 
prediction_decoder: @@ -159,6 +165,7 @@ pipeline: bleu: type: BLEUStatistics priority: 100.2 + ignored_words: ["", ""] streams: targets: indexed_targets diff --git a/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml b/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml index 7302e9d..6101f2e 100644 --- a/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml +++ b/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml @@ -13,7 +13,7 @@ training: sampler: weights: ~/data/vqa-med/answers.c4.weights.csv dataloader: - num_workers: 2 + num_workers: 4 # Termination. terminal_conditions: loss_stop: 1.0e-2 @@ -29,7 +29,7 @@ validation: answer_preprocessing: lowercase, remove_punctuation, tokenize batch_size: 32 dataloader: - num_workers: 2 + num_workers: 4 pipeline: name: c4_enc_attndec_resnet152_ewm_cat_is @@ -50,7 +50,7 @@ pipeline: data_folder: ~/data/vqa-med word_mappings_file: questions.all.word.mappings.csv fixed_padding: 10 - additional_tokens: + additional_tokens: , streams: inputs: questions outputs: embedded_questions @@ -64,6 +64,7 @@ pipeline: import_word_mappings_from_globals: False export_word_mappings_to_globals: True export_pad_mapping_to_globals: True + additional_tokens: , eos_token: True fixed_padding: 10 streams: