From f39ad630a0c259dd73a253663ff5439a00c8cd2d Mon Sep 17 00:00:00 2001 From: Deepta Rajan Date: Sun, 5 May 2019 20:10:22 -0700 Subject: [PATCH 1/3] new config coatt+mfb, clean ups, setting globals output_size --- ...multimodal_factorized_bilinear_pooling.yml | 3 + ..._class_lstm_resnet50_coattn_mfb_cat_is.yml | 133 ++++++ ...snet50_coattn_mfb_is_cat_ffn_c123_loss.yml | 410 ++++++++++++++++++ ptp/components/models/vqa/attention.py | 19 +- .../multimodal_factorized_bilinear_pooling.py | 3 + ptp/components/models/vqa/self_attention.py | 6 +- 6 files changed, 566 insertions(+), 8 deletions(-) create mode 100644 configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml create mode 100644 configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml diff --git a/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml b/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml index c8869fd..b273af0 100644 --- a/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml +++ b/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml @@ -47,6 +47,9 @@ globals: # 4. Keymappings associated with GLOBAL variables that will be SET. #################################################################### + # Size of the output (SET) + output_size: output_size + #################################################################### # 5. Keymappings associated with statistics that will be ADDED. #################################################################### diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml new file mode 100644 index 0000000..cf8bd70 --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml @@ -0,0 +1,133 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + problem: + batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + +validation: + problem: + batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + + +pipeline: + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. 
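+    # (The four sizes correspond to the outputs of the question GRU, the question self-attention, the image attention and the MFB pooling components defined below.)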
+ keys: [question_encoder_output_size, question_attention_activation_size, image_attention_activation_size, pooling_activation_size] + values: [200, 800, 4096, 512] + + ################# PIPE 0: question ################# + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 100 + pretrained_embeddings_file: glove.6B.100d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: GRU + prediction_mode: Dense + use_logsoftmax: False + output_last_state: False + initial_state: Trainable + dropout_rate: 0.1 + hidden_size: 128 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + # Self Attention for question. + question_attention: + priority: 1.4 + type: SelfAttention + latent_size: 128 + num_attention_heads: 4 + streams: + question_encodings: question_activations + outputs: question_attention_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: question_attention_activation_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model_type: resnet50 + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + + image_attention: + priority: 2.2 + type: VQA_Attention + dropout_rate: 0.3 + latent_size: 1024 + output_mode: 'Image' + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_attention_activations + outputs: image_attention_activations + globals: + question_encoding_size: question_attention_activation_size + output_size: image_attention_activation_size + + ################# PIPE 3: image-question fusion ################# + # MFB + question_image_fusion: + priority: 3.1 + type: MultimodalFactorizedBilinearPooling + dropout_rate: 0.3 + latent_size: 512 + pool_factor: 2 + streams: + image_encodings: image_attention_activations + question_encodings: question_attention_activations + outputs: pooling_activations + globals: + image_encoding_size: image_attention_activation_size + question_encoding_size: question_attention_activation_size + output_size: pooling_activation_size + + classifier: + priority: 4.1 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.2 + streams: + inputs: pooling_activations + globals: + input_size: pooling_activation_size + prediction_size: vocabulary_size_c2 + + + #: pipeline diff --git a/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml new file mode 100644 index 0000000..480f4c5 --- /dev/null +++ b/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml @@ -0,0 +1,410 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +hyperparameters: + # In here I am putting some of the hyperparameters from spreadsheet. 
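+  # Each value is declared once as a YAML anchor (&name) and referenced later in this file via the matching alias (*name).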
+ + question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize + # Accepted formats: a,b,c or [a,b,c] + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + + image_preprocessing: &image_preprocessing normalize + # Accepted formats: a,b,c or [a,b,c] + # none | random_affine | random_horizontal_flip | normalize | all + + # Image encoder. + image_encoder_model: &image_encoder_model resnet50 + # Options: vgg16 | densenet121 | resnet152 | resnet50 + # image_encoder_output_size_val: &image_encoder_output_size_val 2048 + image_attention_multihead_size_val: &image_attention_multihead_size_val 2 + image_attention_output_size_val: &image_attention_output_size_val 4096 + + # Question encoder. + question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt + # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 50 + question_encoder_lstm_size_val: &question_encoder_lstm_size_val 128 + question_encoder_output_size_val: &question_encoder_output_size_val 200 + question_attention_multihead_size_val: &question_attention_multihead_size_val 4 + question_attention_output_size_val: &question_attention_output_size_val 800 + + # Fusion I: image + question + question_image_fusion_type_val: &question_image_fusion_type MultimodalFactorizedBilinearPooling + # Options: ElementWiseMultiplication | MultimodalFactorizedBilinearPooling (component: question_image_fusion) + question_image_fusion_size_val: &question_image_fusion_size_val 512 + + # Image size encoder. + image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 + + # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 522 + + # Final classifier: FFN. + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] + + batch_size: &batch_size 64 + +# Training parameters: +training: + problem: + batch_size: *batch_size + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + + # Optimizer parameters: + optimizer: + name: Adam + lr: 0.0001 + + # Terminal conditions: + terminal_conditions: + loss_stop: 1.0e-3 + episode_limit: 5000 #10000 + epoch_limit: -1 + +# Validation parameters: +validation: + problem: + batch_size: *batch_size + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. 
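+    # (Values come from the hyperparameter anchors defined above; the literal 2 is the size of the raw image-size input.)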
+ keys: [question_encoder_output_size, question_attention_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_attention_output_size, fused_activation_size] + values: [*question_encoder_output_size_val, *question_attention_output_size_val, 2, *image_size_encoder_output_size_val, *image_attention_output_size_val, *question_image_fusion_size_val] + # keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size] + # values: [*question_encoder_output_size_val, 2, *image_size_encoder_output_size_val] + + # Statistics. + batch_size: + priority: 0.1 + type: BatchSizeStatistics + + # Answer encoding. + pipe1_all_answer_indexer: + priority: 0.2 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: answers_ids + globals: + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn + + + ################# PIPE 0: QUESTION CATEGORIZATION ################# + + # Add global variables - the ones related to only question categorization. + pipe0_global_publisher: + priority: 0.3 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [pipe0_question_encoder_output_size] + values: [100] + + # Model 1: question embeddings + pipe0_question_embeddings: + priority: 0.4 + type: SentenceEmbeddings + # LOAD AND FREEZE # + load: + file: ~/Repositories/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + globals: + embeddings_size: pipe0_embeddings_size + + # Model 2: question RNN + pipe0_lstm: + priority: 0.5 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/Repositories/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_question_activations + globals: + input_size: pipe0_embeddings_size + prediction_size: pipe0_question_encoder_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.6 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/Repositories/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.7 + streams: + inputs: pipe0_question_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: pipe0_question_encoder_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.8 + type: WordDecoder + # Use the same word mappings as label indexer. 
+ import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + priority: 0.9 + type: AccuracyStatistics + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: *question_encoder_embeddings_size_val + pretrained_embeddings_file: *question_encoder_embeddings + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + globals: + embeddings_size: pipe1_embeddings_size + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: GRU + hidden_size: *question_encoder_lstm_size_val + prediction_mode: Dense + initial_state: Trainable + use_logsoftmax: False + output_last_state: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: pipe1_embeddings_size + prediction_size: question_encoder_output_size + + + # Model 3: self attention for question. + question_attention: + priority: 1.3 + type: SelfAttention + latent_size: *question_encoder_lstm_size_val + num_attention_heads: 4 + streams: + question_encodings: question_activations + outputs: question_attention_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: question_attention_output_size + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model: *image_encoder_model + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + # globals: + # output_size: image_encoder_output_size + + image_attention: + priority: 2.2 + type: VQA_Attention + dropout_rate: 0.3 + latent_size: 256 + output_mode: 'Image' + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_attention_activations + outputs: image_attention_activations + globals: + question_encoding_size: question_attention_output_size + image_encoding_size: image_attention_output_size + + ################# PIPE 3: image-question fusion ################# + # MFB. + question_image_fusion: + priority: 3.1 + type: *question_image_fusion_type + dropout_rate: 0.5 + latent_size: 512 + pool_factor: 2 + streams: + image_encodings: image_attention_activations + question_encodings: question_attention_activations + outputs: fused_activations + globals: + image_encoding_size: image_attention_output_size + question_encoding_size: question_attention_output_size + output_size: fused_activation_size + + ################# PIPE 4: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. 
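+  # (A small FFN that embeds the 2-element image size input into a 10-dimensional vector, later concatenated with the fused image-question activations.)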
+ image_size_encoder: + priority: 4.1 + type: FeedForwardNetwork + use_losfotmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + # question_image_ffn: + # priority: 4.2 + # type: FeedForwardNetwork + # # hidden_sizes: [*question_image_fusion_size_val] + # dropout_rate: 0.5 + # use_logsoftmax: False + # streams: + # inputs: fused_activations + # predictions: question_image_activations + # globals: + # input_size: fused_activation_size + # prediction_size: fused_activation_size + + ################# PIPE 5: image-question-image size fusion ################# + + # 5th subpipeline: concatenation + concat: + priority: 5.1 + type: Concatenation + input_streams: [fused_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,*question_image_fusion_size_val],[-1,*image_size_encoder_output_size_val]] + output_dims: [-1,*question_image_size_fusion_size_val] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + ################# PIPE 6: C1 + C2 + C3 questions ################# + + # Model 4: FFN C123 answering + pipe6_c123_answer_classifier: + priority: 6.3 + type: FeedForwardNetwork + hidden: *answer_classifier_hidden_sizes_val + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe6_c123_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_binary_yn + + pipe6_c123_nllloss: + priority: 6.4 + type: NLLLoss + targets_dim: 1 + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + loss: pipe6_c123_loss + + pipe6_c123_precision_recall: + priority: 6.5 + type: PrecisionRecallStatistics + use_word_mappings: True + show_class_scores: True + #show_confusion_matrix: True + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + globals: + word_mappings: word_mappings_c123_binary_yn + statistics: + precision: pipe6_c123_precision + recall: pipe6_c123_recall + f1score: pipe6_c123_f1score + + # C123 Predictions decoder. + pipe5_c123_prediction_decoder: + priority: 6.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe6_c123_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_binary_yn + + ################# PIPE 7: MERGE ANSWERS ################# + + # Viewers. 
+ viewer: + priority: 7.3 + type: StreamViewer + input_streams: + tokenized_questions, + category_names, pipe0_predicted_question_categories_names, + answers, predicted_answers + + +#: pipeline diff --git a/ptp/components/models/vqa/attention.py b/ptp/components/models/vqa/attention.py index 88c868f..5f26ed2 100644 --- a/ptp/components/models/vqa/attention.py +++ b/ptp/components/models/vqa/attention.py @@ -60,8 +60,15 @@ def __init__(self, name, config): self.latent_size = self.config["latent_size"] self.num_attention_heads = self.config["num_attention_heads"] + # Output new attention weighted image encoding only, or both image and question image_encodings + self.output_mode = self.config["output_mode"] + # Output feature size - self.output_size = self.feature_maps_depth*self.num_attention_heads + self.question_encoding_size + if(self.output_mode == 'Image'): + self.output_size = self.feature_maps_depth*self.num_attention_heads + elif(self.output_mode == 'None'): + self.output_size = self.feature_maps_depth*self.num_attention_heads + self.question_encoding_size + # Export to globals. self.globals["output_size"] = self.output_size @@ -137,8 +144,12 @@ def forward(self, data_dict): attention_enc_img = apply_attention(enc_img, attention) # [48, 2048, 7, 7], [48, 2, 7, 7] # print("attention im", attention_enc_img.shape) - # Fusion -- Concatenate attention-weighted image encodings and question encodings. - outputs = torch.cat([attention_enc_img, latent_q], dim=1) + if(self.output_mode == 'Image'): + # Output attention-weighted image encodings + outputs = attention_enc_img + elif(self.output_mode == 'None'): + # Fusion -- Concatenate attention-weighted image encodings and question encodings. + outputs = torch.cat([attention_enc_img, latent_q], dim=1) # print("outputs", outputs.shape) # Add predictions to datadict. data_dict.extend({self.key_outputs: outputs}) @@ -165,4 +176,4 @@ def apply_attention(input, attention): attention = torch.nn.functional.softmax(attention, dim=-1).unsqueeze(2) # [n, g, 1, s] [batch, multi_head, 1, height*width] [48, 2, 1, 7*7] weighted = attention * input # [n, g, c, s] [48, 2, 2048, 7*7] weighted_mean = weighted.sum(dim=-1) # [n, g, c] [48, 2, 2048] - return weighted_mean.view(n, -1) # [48, 4196] + return weighted_mean.view(n, -1) # [48, 4096] diff --git a/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py index f09b374..407ed44 100644 --- a/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py @@ -59,6 +59,9 @@ def __init__(self, name, config): # Output feature size self.output_size = self.latent_size + # Export to globals. + self.globals["output_size"] = self.output_size + # Map image and question encodings to a common latent space of dimension 'latent_size'. 
self.image_encodings_ff = torch.nn.Linear(self.image_encoding_size, self.latent_size*self.factor) self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.latent_size*self.factor) diff --git a/ptp/components/models/vqa/self_attention.py b/ptp/components/models/vqa/self_attention.py index 01adc18..080f9a9 100644 --- a/ptp/components/models/vqa/self_attention.py +++ b/ptp/components/models/vqa/self_attention.py @@ -104,14 +104,12 @@ def forward(self, data_dict): self.Attention = torch.softmax(self.W2(self.activation(self.W1(input_enc))), dim = 1) # [48, 8, 4] [batch, num_words, num_heads] # Multiply attention weights with question encoding - input_enc_weighted = torch.matmul(self.Attention.transpose(1,2),input_enc) - print("input_enc_weighted", input_enc_weighted.shape) + input_enc_weighted = torch.matmul(self.Attention.transpose(1,2),input_enc) # [48, 4, 100] [batch, num_heads, embed_dim] # Concatenate features from multi-head attention - outputs = input_enc_weighted.view(batch_size, -1) + outputs = input_enc_weighted.view(batch_size, -1) # [48, 400] [batch, num_heads*embed_dim] # # Alternatively: combine multi-head attention using a mean or sum operation # outputs = torch.sum(input_enc_weighted,1)/self.num_attention_heads - print("outputs", outputs.shape) # Add predictions to datadict. data_dict.extend({self.key_outputs: outputs}) From 988b8096924315232b3e28c2045851c47239548c Mon Sep 17 00:00:00 2001 From: Deepta Rajan Date: Mon, 6 May 2019 13:49:01 -0700 Subject: [PATCH 2/3] fixed 'output_mode' in VQA attention --- .../components/models/vqa/attention.yml | 7 +- ...multimodal_factorized_bilinear_pooling.yml | 3 - ...snet50_coattn_mfb_is_cat_ffn_c123_loss.yml | 131 ++---------------- ptp/components/models/vqa/attention.py | 4 +- .../multimodal_factorized_bilinear_pooling.py | 1 + 5 files changed, 20 insertions(+), 126 deletions(-) diff --git a/configs/default/components/models/vqa/attention.yml b/configs/default/components/models/vqa/attention.yml index 4018be1..4a65a76 100644 --- a/configs/default/components/models/vqa/attention.yml +++ b/configs/default/components/models/vqa/attention.yml @@ -14,6 +14,11 @@ latent_size: 100 # Number of attention heads (LOADED) num_attention_heads: 2 +# Type of output returned +# Options: Image | Fusion +# Details: attention-weighted image | concatenation of attention-weighted image and RNN encoded question +output_mode: Fusion + streams: #################################################################### @@ -42,7 +47,7 @@ globals: # Depth of the features tensor (RETRIEVED) feature_maps_depth: feature_maps_depth - + # Size of the question encodings input (RETRIEVED) question_encoding_size: question_encoding_size diff --git a/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml b/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml index b273af0..1c55fe9 100644 --- a/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml +++ b/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml @@ -40,9 +40,6 @@ globals: # Size of the question encodings input (RETRIEVED) question_encoding_size: question_encoding_size - # Size of the output (RETRIEVED) - output_size: output_size - #################################################################### # 4. Keymappings associated with GLOBAL variables that will be SET. 
#################################################################### diff --git a/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml index 480f4c5..e7010be 100644 --- a/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml @@ -17,12 +17,12 @@ hyperparameters: # Options: vgg16 | densenet121 | resnet152 | resnet50 # image_encoder_output_size_val: &image_encoder_output_size_val 2048 image_attention_multihead_size_val: &image_attention_multihead_size_val 2 - image_attention_output_size_val: &image_attention_output_size_val 4096 + # image_attention_output_size_val: &image_attention_output_size_val 4096 # Question encoder. - question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt + question_encoder_embeddings: &question_encoder_embeddings glove.6B.100d.txt # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled - question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 50 + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 100 question_encoder_lstm_size_val: &question_encoder_lstm_size_val 128 question_encoder_output_size_val: &question_encoder_output_size_val 200 question_attention_multihead_size_val: &question_attention_multihead_size_val 4 @@ -66,7 +66,7 @@ training: # Terminal conditions: terminal_conditions: loss_stop: 1.0e-3 - episode_limit: 5000 #10000 + episode_limit: 2000 #10000 epoch_limit: -1 # Validation parameters: @@ -90,10 +90,8 @@ pipeline: priority: 0 type: GlobalVariablePublisher # Add input_size to globals. - keys: [question_encoder_output_size, question_attention_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_attention_output_size, fused_activation_size] - values: [*question_encoder_output_size_val, *question_attention_output_size_val, 2, *image_size_encoder_output_size_val, *image_attention_output_size_val, *question_image_fusion_size_val] - # keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size] - # values: [*question_encoder_output_size_val, 2, *image_size_encoder_output_size_val] + keys: [question_encoder_output_size, question_attention_output_size, image_size_encoder_input_size, image_size_encoder_output_size, fused_activation_size] + values: [*question_encoder_output_size_val, *question_attention_output_size_val, 2, *image_size_encoder_output_size_val, *question_image_fusion_size_val] # Statistics. batch_size: @@ -115,98 +113,6 @@ pipeline: vocabulary_size: vocabulary_size_c123_binary_yn word_mappings: word_mappings_c123_binary_yn - - ################# PIPE 0: QUESTION CATEGORIZATION ################# - - # Add global variables - the ones related to only question categorization. - pipe0_global_publisher: - priority: 0.3 - type: GlobalVariablePublisher - # Add input_size to globals. 
- keys: [pipe0_question_encoder_output_size] - values: [100] - - # Model 1: question embeddings - pipe0_question_embeddings: - priority: 0.4 - type: SentenceEmbeddings - # LOAD AND FREEZE # - load: - file: ~/Repositories/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt - model: question_embeddings - freeze: True - ################### - embeddings_size: 50 - pretrained_embeddings_file: glove.6B.50d.txt - data_folder: ~/data/vqa-med - word_mappings_file: questions.all.word.mappings.csv - streams: - inputs: tokenized_questions - outputs: pipe0_embedded_questions - globals: - embeddings_size: pipe0_embeddings_size - - # Model 2: question RNN - pipe0_lstm: - priority: 0.5 - type: RecurrentNeuralNetwork - cell_type: LSTM - # LOAD AND FREEZE # - load: - file: ~/Repositories/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt - model: lstm - freeze: True - ################### - prediction_mode: Last - initial_state: Trainable - use_logsoftmax: False - streams: - inputs: pipe0_embedded_questions - predictions: pipe0_question_activations - globals: - input_size: pipe0_embeddings_size - prediction_size: pipe0_question_encoder_output_size - - # Model 3: FFN question category - pipe0_classifier: - priority: 0.6 - type: FeedForwardNetwork - # LOAD AND FREEZE # - load: - file: ~/Repositories/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt - model: classifier - freeze: True - ################### - hidden: [50] - dropout_rate: 0.7 - streams: - inputs: pipe0_question_activations - predictions: pipe0_predicted_question_categories_preds - globals: - input_size: pipe0_question_encoder_output_size # Set by global publisher - prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK - - pipe0_category_decoder: - priority: 0.8 - type: WordDecoder - # Use the same word mappings as label indexer. - import_word_mappings_from_globals: True - streams: - inputs: pipe0_predicted_question_categories_preds - outputs: pipe0_predicted_question_categories_names - globals: - vocabulary_size: num_categories - word_mappings: category_word_mappings - - pipe0_category_accuracy: - priority: 0.9 - type: AccuracyStatistics - streams: - targets: category_ids - predictions: pipe0_predicted_question_categories_preds - statistics: - accuracy: categorization_accuracy - ################# PIPE 1: SHARED QUESTION ENCODER ################# # Model 1: question embeddings @@ -260,13 +166,12 @@ pipeline: image_encoder: priority: 2.1 type: TorchVisionWrapper - model: *image_encoder_model + model_type: *image_encoder_model return_feature_maps: True streams: inputs: images outputs: feature_maps - # globals: - # output_size: image_encoder_output_size + image_attention: priority: 2.2 @@ -281,7 +186,7 @@ pipeline: outputs: image_attention_activations globals: question_encoding_size: question_attention_output_size - image_encoding_size: image_attention_output_size + output_size: image_attention_output_size ################# PIPE 3: image-question fusion ################# # MFB. 
@@ -298,7 +203,7 @@ pipeline: globals: image_encoding_size: image_attention_output_size question_encoding_size: question_attention_output_size - output_size: fused_activation_size + # output_size: image_attention_output_size #fused_activation_size ################# PIPE 4: SHARED IMAGE SIZE ENCODER ################# @@ -314,19 +219,6 @@ pipeline: input_size: image_size_encoder_input_size prediction_size: image_size_encoder_output_size - # question_image_ffn: - # priority: 4.2 - # type: FeedForwardNetwork - # # hidden_sizes: [*question_image_fusion_size_val] - # dropout_rate: 0.5 - # use_logsoftmax: False - # streams: - # inputs: fused_activations - # predictions: question_image_activations - # globals: - # input_size: fused_activation_size - # prediction_size: fused_activation_size - ################# PIPE 5: image-question-image size fusion ################# # 5th subpipeline: concatenation @@ -405,6 +297,3 @@ pipeline: tokenized_questions, category_names, pipe0_predicted_question_categories_names, answers, predicted_answers - - -#: pipeline diff --git a/ptp/components/models/vqa/attention.py b/ptp/components/models/vqa/attention.py index 5f26ed2..4a04b66 100644 --- a/ptp/components/models/vqa/attention.py +++ b/ptp/components/models/vqa/attention.py @@ -66,8 +66,10 @@ def __init__(self, name, config): # Output feature size if(self.output_mode == 'Image'): self.output_size = self.feature_maps_depth*self.num_attention_heads - elif(self.output_mode == 'None'): + elif(self.output_mode == 'Fusion'): self.output_size = self.feature_maps_depth*self.num_attention_heads + self.question_encoding_size + else: + print("'output_mode' unspecified for VQA Attention in config") #TODO: find a better way to report corner case issue # Export to globals. self.globals["output_size"] = self.output_size diff --git a/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py index 407ed44..e9fcc33 100644 --- a/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py @@ -116,6 +116,7 @@ def forward(self, data_dict): # Element-wise mutliplication of image and question encodings enc_z = latent_img * latent_q # [48, 512] + # Dropout regularization enc_z = self.dropout(enc_z) enc_z = enc_z.view(enc_z.size(0), self.latent_size, self.factor) # [48, 256, 2] From 286e5968ab6c4cbd8ac2c003538ed47fed7c6626 Mon Sep 17 00:00:00 2001 From: Deepta Rajan Date: Mon, 6 May 2019 16:56:26 -0700 Subject: [PATCH 3/3] c1234 config --- ...vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml | 299 ++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml diff --git a/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml b/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml new file mode 100644 index 0000000..5e7a511 --- /dev/null +++ b/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml @@ -0,0 +1,299 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +hyperparameters: + # In here I am putting some of the hyperparameters from spreadsheet. 
+ + question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize + # Accepted formats: a,b,c or [a,b,c] + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + + image_preprocessing: &image_preprocessing normalize + # Accepted formats: a,b,c or [a,b,c] + # none | random_affine | random_horizontal_flip | normalize | all + + # Image encoder. + image_encoder_model: &image_encoder_model vgg16 + # Options: vgg16 | densenet121 | resnet152 | resnet50 + # image_encoder_output_size_val: &image_encoder_output_size_val 2048 + image_attention_multihead_size_val: &image_attention_multihead_size_val 2 + # image_attention_output_size_val: &image_attention_output_size_val 4096 + + # Question encoder. + question_encoder_embeddings: &question_encoder_embeddings glove.6B.100d.txt + # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 100 + question_encoder_lstm_size_val: &question_encoder_lstm_size_val 128 + question_encoder_output_size_val: &question_encoder_output_size_val 200 + question_attention_multihead_size_val: &question_attention_multihead_size_val 4 + question_attention_output_size_val: &question_attention_output_size_val 800 + + # Fusion I: image + question + question_image_fusion_type_val: &question_image_fusion_type MultimodalFactorizedBilinearPooling + # Options: ElementWiseMultiplication | MultimodalFactorizedBilinearPooling (component: question_image_fusion) + question_image_fusion_size_val: &question_image_fusion_size_val 512 + + # Image size encoder. + image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 + + # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 522 + + # Final classifier: FFN. + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] + + batch_size: &batch_size 64 + +# Training parameters: +training: + problem: + batch_size: *batch_size + categories: all #C1,C2,C3 # TODO: all + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_c4_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_c4_binary_yn.weights.csv + + # Optimizer parameters: + optimizer: + name: Adam + lr: 0.0001 + + # Terminal conditions: + terminal_conditions: + loss_stop: 1.0e-3 + episode_limit: 5000 #10000 + epoch_limit: -1 + +# Validation parameters: +validation: + problem: + batch_size: *batch_size + categories: all #C1,C2,C3 # TODO: all + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. 
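+    # Note: image_attention_output_size is not published here; the VQA_Attention component below exports it to globals itself.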
+ keys: [question_encoder_output_size, question_attention_output_size, image_size_encoder_input_size, image_size_encoder_output_size, fused_activation_size] + values: [*question_encoder_output_size_val, *question_attention_output_size_val, 2, *image_size_encoder_output_size_val, *question_image_fusion_size_val] + + # Statistics. + batch_size: + priority: 0.1 + type: BatchSizeStatistics + + # Answer encoding. + pipe1_all_answer_indexer: + priority: 0.2 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.all.word.mappings.csv # TODO: all + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: answers_ids + globals: + vocabulary_size: vocabulary_size_c1234_binary_yn # TODO: rename + word_mappings: word_mappings_c1234_binary_yn + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: *question_encoder_embeddings_size_val + pretrained_embeddings_file: *question_encoder_embeddings + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + globals: + embeddings_size: pipe1_embeddings_size + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: GRU + hidden_size: *question_encoder_lstm_size_val + prediction_mode: Dense + initial_state: Trainable + use_logsoftmax: False + output_last_state: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: pipe1_embeddings_size + prediction_size: question_encoder_output_size + + + # Model 3: self attention for question. + question_attention: + priority: 1.3 + type: SelfAttention + latent_size: *question_encoder_lstm_size_val + num_attention_heads: 4 + streams: + question_encodings: question_activations + outputs: question_attention_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: question_attention_output_size + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model_type: *image_encoder_model + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + + image_attention: + priority: 2.2 + type: VQA_Attention + dropout_rate: 0.3 + latent_size: 256 + output_mode: 'Image' + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_attention_activations + outputs: image_attention_activations + globals: + question_encoding_size: question_attention_output_size + output_size: image_attention_output_size + + ################# PIPE 3: image-question fusion ################# + # MFB. + question_image_fusion: + priority: 3.1 + type: *question_image_fusion_type + dropout_rate: 0.5 + latent_size: 512 + pool_factor: 2 + streams: + image_encodings: image_attention_activations + question_encodings: question_attention_activations + outputs: fused_activations + globals: + image_encoding_size: image_attention_output_size + question_encoding_size: question_attention_output_size + # output_size: image_attention_output_size #fused_activation_size + + ################# PIPE 4: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. 
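+  # (A small FFN that encodes the 2-element image size input into a 10-dimensional embedding consumed by the concatenation stage.)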
+  image_size_encoder:
+    priority: 4.1
+    type: FeedForwardNetwork
+    use_logsoftmax: False
+    streams:
+      inputs: image_sizes
+      predictions: image_size_activations
+    globals:
+      input_size: image_size_encoder_input_size
+      prediction_size: image_size_encoder_output_size
+
+  ################# PIPE 5: image-question-image size fusion #################
+
+  # 5th subpipeline: concatenation
+  concat:
+    priority: 5.1
+    type: Concatenation
+    input_streams: [fused_activations,image_size_activations]
+    # Concatenation
+    dim: 1 # default
+    input_dims: [[-1,*question_image_fusion_size_val],[-1,*image_size_encoder_output_size_val]]
+    output_dims: [-1,*question_image_size_fusion_size_val]
+    streams:
+      outputs: concatenated_activations
+    globals:
+      output_size: concatenated_activations_size
+
+  ################# PIPE 6: C1 + C2 + C3 + C4 questions #################
+
+  # Model 4: FFN answering over C1-C4 + binary Y/N.
+  pipe6_c123_answer_classifier:
+    priority: 6.3
+    type: FeedForwardNetwork
+    hidden: *answer_classifier_hidden_sizes_val
+    dropout_rate: 0.5
+    streams:
+      inputs: concatenated_activations
+      predictions: pipe6_c123_predictions
+    globals:
+      input_size: concatenated_activations_size
+      prediction_size: vocabulary_size_c1234_binary_yn
+
+  pipe6_c123_nllloss:
+    priority: 6.4
+    type: NLLLoss
+    targets_dim: 1
+    streams:
+      predictions: pipe6_c123_predictions
+      targets: answers_ids
+      loss: pipe6_c123_loss
+
+  pipe6_c123_precision_recall:
+    priority: 6.5
+    type: PrecisionRecallStatistics
+    use_word_mappings: True
+    show_class_scores: True
+    #show_confusion_matrix: True
+    streams:
+      predictions: pipe6_c123_predictions
+      targets: answers_ids
+    globals:
+      word_mappings: word_mappings_c1234_binary_yn
+    statistics:
+      precision: pipe6_c123_precision
+      recall: pipe6_c123_recall
+      f1score: pipe6_c123_f1score
+
+  # Predictions decoder.
+  pipe5_c123_prediction_decoder:
+    priority: 6.6
+    type: WordDecoder
+    # Use the same word mappings as the label indexer.
+    import_word_mappings_from_globals: True
+    streams:
+      inputs: pipe6_c123_predictions
+      outputs: predicted_answers
+    globals:
+      word_mappings: word_mappings_c1234_binary_yn
+
+  ################# PIPE 7: MERGE ANSWERS #################
+
+  # Viewers.
+  viewer:
+    priority: 7.3
+    type: StreamViewer
+    input_streams:
+      tokenized_questions,
+      category_names,
+      answers, predicted_answers