diff --git a/configs/default/components/models/vqa/attention.yml b/configs/default/components/models/vqa/attention.yml index 4018be1..4a65a76 100644 --- a/configs/default/components/models/vqa/attention.yml +++ b/configs/default/components/models/vqa/attention.yml @@ -14,6 +14,11 @@ latent_size: 100 # Number of attention heads (LOADED) num_attention_heads: 2 +# Type of output returned +# Options: Image | Fusion +# Details: attention-weighted image | concatenation of attention-weighted image and RNN encoded question +output_mode: Fusion + streams: #################################################################### @@ -42,7 +47,7 @@ globals: # Depth of the features tensor (RETRIEVED) feature_maps_depth: feature_maps_depth - + # Size of the question encodings input (RETRIEVED) question_encoding_size: question_encoding_size diff --git a/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml b/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml index c8869fd..1c55fe9 100644 --- a/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml +++ b/configs/default/components/models/vqa/multimodal_factorized_bilinear_pooling.yml @@ -40,13 +40,13 @@ globals: # Size of the question encodings input (RETRIEVED) question_encoding_size: question_encoding_size - # Size of the output (RETRIEVED) - output_size: output_size - #################################################################### # 4. Keymappings associated with GLOBAL variables that will be SET. #################################################################### + # Size of the output (SET) + output_size: output_size + #################################################################### # 5. Keymappings associated with statistics that will be ADDED. #################################################################### diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml new file mode 100644 index 0000000..cf8bd70 --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml @@ -0,0 +1,133 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + problem: + batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + +validation: + problem: + batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + + +pipeline: + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. 
+ keys: [question_encoder_output_size, question_attention_activation_size, image_attention_activation_size, pooling_activation_size] + values: [200, 800, 4096, 512] + + ################# PIPE 0: question ################# + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 100 + pretrained_embeddings_file: glove.6B.100d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: GRU + prediction_mode: Dense + use_logsoftmax: False + output_last_state: False + initial_state: Trainable + dropout_rate: 0.1 + hidden_size: 128 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + # Self Attention for question. + question_attention: + priority: 1.4 + type: SelfAttention + latent_size: 128 + num_attention_heads: 4 + streams: + question_encodings: question_activations + outputs: question_attention_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: question_attention_activation_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model_type: resnet50 + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + + image_attention: + priority: 2.2 + type: VQA_Attention + dropout_rate: 0.3 + latent_size: 1024 + output_mode: 'Image' + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_attention_activations + outputs: image_attention_activations + globals: + question_encoding_size: question_attention_activation_size + output_size: image_attention_activation_size + + ################# PIPE 3: image-question fusion ################# + # MFB + question_image_fusion: + priority: 3.1 + type: MultimodalFactorizedBilinearPooling + dropout_rate: 0.3 + latent_size: 512 + pool_factor: 2 + streams: + image_encodings: image_attention_activations + question_encodings: question_attention_activations + outputs: pooling_activations + globals: + image_encoding_size: image_attention_activation_size + question_encoding_size: question_attention_activation_size + output_size: pooling_activation_size + + classifier: + priority: 4.1 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.2 + streams: + inputs: pooling_activations + globals: + input_size: pooling_activation_size + prediction_size: vocabulary_size_c2 + + + #: pipeline diff --git a/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml new file mode 100644 index 0000000..e7010be --- /dev/null +++ b/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml @@ -0,0 +1,299 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +hyperparameters: + # In here I am putting some of the hyperparameters from spreadsheet. 
+ + question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize + # Accepted formats: a,b,c or [a,b,c] + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + + image_preprocessing: &image_preprocessing normalize + # Accepted formats: a,b,c or [a,b,c] + # none | random_affine | random_horizontal_flip | normalize | all + + # Image encoder. + image_encoder_model: &image_encoder_model resnet50 + # Options: vgg16 | densenet121 | resnet152 | resnet50 + # image_encoder_output_size_val: &image_encoder_output_size_val 2048 + image_attention_multihead_size_val: &image_attention_multihead_size_val 2 + # image_attention_output_size_val: &image_attention_output_size_val 4096 + + # Question encoder. + question_encoder_embeddings: &question_encoder_embeddings glove.6B.100d.txt + # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 100 + question_encoder_lstm_size_val: &question_encoder_lstm_size_val 128 + question_encoder_output_size_val: &question_encoder_output_size_val 200 + question_attention_multihead_size_val: &question_attention_multihead_size_val 4 + question_attention_output_size_val: &question_attention_output_size_val 800 + + # Fusion I: image + question + question_image_fusion_type_val: &question_image_fusion_type MultimodalFactorizedBilinearPooling + # Options: ElementWiseMultiplication | MultimodalFactorizedBilinearPooling (component: question_image_fusion) + question_image_fusion_size_val: &question_image_fusion_size_val 512 + + # Image size encoder. + image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 + + # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 522 + + # Final classifier: FFN. + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] + + batch_size: &batch_size 64 + +# Training parameters: +training: + problem: + batch_size: *batch_size + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + + # Optimizer parameters: + optimizer: + name: Adam + lr: 0.0001 + + # Terminal conditions: + terminal_conditions: + loss_stop: 1.0e-3 + episode_limit: 2000 #10000 + epoch_limit: -1 + +# Validation parameters: +validation: + problem: + batch_size: *batch_size + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. 
+ keys: [question_encoder_output_size, question_attention_output_size, image_size_encoder_input_size, image_size_encoder_output_size, fused_activation_size] + values: [*question_encoder_output_size_val, *question_attention_output_size_val, 2, *image_size_encoder_output_size_val, *question_image_fusion_size_val] + + # Statistics. + batch_size: + priority: 0.1 + type: BatchSizeStatistics + + # Answer encoding. + pipe1_all_answer_indexer: + priority: 0.2 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: answers_ids + globals: + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: *question_encoder_embeddings_size_val + pretrained_embeddings_file: *question_encoder_embeddings + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + globals: + embeddings_size: pipe1_embeddings_size + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: GRU + hidden_size: *question_encoder_lstm_size_val + prediction_mode: Dense + initial_state: Trainable + use_logsoftmax: False + output_last_state: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: pipe1_embeddings_size + prediction_size: question_encoder_output_size + + + # Model 3: self attention for question. + question_attention: + priority: 1.3 + type: SelfAttention + latent_size: *question_encoder_lstm_size_val + num_attention_heads: 4 + streams: + question_encodings: question_activations + outputs: question_attention_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: question_attention_output_size + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model_type: *image_encoder_model + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + + image_attention: + priority: 2.2 + type: VQA_Attention + dropout_rate: 0.3 + latent_size: 256 + output_mode: 'Image' + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_attention_activations + outputs: image_attention_activations + globals: + question_encoding_size: question_attention_output_size + output_size: image_attention_output_size + + ################# PIPE 3: image-question fusion ################# + # MFB. + question_image_fusion: + priority: 3.1 + type: *question_image_fusion_type + dropout_rate: 0.5 + latent_size: 512 + pool_factor: 2 + streams: + image_encodings: image_attention_activations + question_encodings: question_attention_activations + outputs: fused_activations + globals: + image_encoding_size: image_attention_output_size + question_encoding_size: question_attention_output_size + # output_size: image_attention_output_size #fused_activation_size + + ################# PIPE 4: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. 
+  image_size_encoder:
+    priority: 4.1
+    type: FeedForwardNetwork
+    use_logsoftmax: False
+    streams:
+      inputs: image_sizes
+      predictions: image_size_activations
+    globals:
+      input_size: image_size_encoder_input_size
+      prediction_size: image_size_encoder_output_size
+
+  ################# PIPE 5: image-question-image size fusion #################
+
+  # 5th subpipeline: concatenation
+  concat:
+    priority: 5.1
+    type: Concatenation
+    input_streams: [fused_activations,image_size_activations]
+    # Concatenation
+    dim: 1 # default
+    input_dims: [[-1,*question_image_fusion_size_val],[-1,*image_size_encoder_output_size_val]]
+    output_dims: [-1,*question_image_size_fusion_size_val]
+    streams:
+      outputs: concatenated_activations
+    globals:
+      output_size: concatenated_activations_size
+
+  ################# PIPE 6: C1 + C2 + C3 questions #################
+
+  # Model 4: FFN C123 answering
+  pipe6_c123_answer_classifier:
+    priority: 6.3
+    type: FeedForwardNetwork
+    hidden_sizes: *answer_classifier_hidden_sizes_val
+    dropout_rate: 0.5
+    streams:
+      inputs: concatenated_activations
+      predictions: pipe6_c123_predictions
+    globals:
+      input_size: concatenated_activations_size
+      prediction_size: vocabulary_size_c123_binary_yn
+
+  pipe6_c123_nllloss:
+    priority: 6.4
+    type: NLLLoss
+    targets_dim: 1
+    streams:
+      predictions: pipe6_c123_predictions
+      targets: answers_ids
+      loss: pipe6_c123_loss
+
+  pipe6_c123_precision_recall:
+    priority: 6.5
+    type: PrecisionRecallStatistics
+    use_word_mappings: True
+    show_class_scores: True
+    #show_confusion_matrix: True
+    streams:
+      predictions: pipe6_c123_predictions
+      targets: answers_ids
+    globals:
+      word_mappings: word_mappings_c123_binary_yn
+    statistics:
+      precision: pipe6_c123_precision
+      recall: pipe6_c123_recall
+      f1score: pipe6_c123_f1score
+
+  # C123 predictions decoder.
+  pipe6_c123_prediction_decoder:
+    priority: 6.6
+    type: WordDecoder
+    # Use the same word mappings as the label indexer.
+    import_word_mappings_from_globals: True
+    streams:
+      inputs: pipe6_c123_predictions
+      outputs: predicted_answers
+    globals:
+      word_mappings: word_mappings_c123_binary_yn
+
+  ################# PIPE 7: MERGE ANSWERS #################
+
+  # Viewers.
+  viewer:
+    priority: 7.3
+    type: StreamViewer
+    input_streams:
+      tokenized_questions,
+      category_names, pipe0_predicted_question_categories_names,
+      answers, predicted_answers
diff --git a/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml b/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml
new file mode 100644
index 0000000..5e7a511
--- /dev/null
+++ b/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml
@@ -0,0 +1,299 @@
+# Load config defining problems for training, validation and testing.
+default_configs: vqa_med_2019/default_vqa_med_2019.yml
+
+hyperparameters:
+  # Selected hyperparameters, collected from the experiment spreadsheet.
+
+  question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize
+  # Accepted formats: a,b,c or [a,b,c]
+  # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all
+
+  image_preprocessing: &image_preprocessing normalize
+  # Accepted formats: a,b,c or [a,b,c]
+  # none | random_affine | random_horizontal_flip | normalize | all
+
+  # Image encoder.
+ image_encoder_model: &image_encoder_model vgg16 + # Options: vgg16 | densenet121 | resnet152 | resnet50 + # image_encoder_output_size_val: &image_encoder_output_size_val 2048 + image_attention_multihead_size_val: &image_attention_multihead_size_val 2 + # image_attention_output_size_val: &image_attention_output_size_val 4096 + + # Question encoder. + question_encoder_embeddings: &question_encoder_embeddings glove.6B.100d.txt + # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 100 + question_encoder_lstm_size_val: &question_encoder_lstm_size_val 128 + question_encoder_output_size_val: &question_encoder_output_size_val 200 + question_attention_multihead_size_val: &question_attention_multihead_size_val 4 + question_attention_output_size_val: &question_attention_output_size_val 800 + + # Fusion I: image + question + question_image_fusion_type_val: &question_image_fusion_type MultimodalFactorizedBilinearPooling + # Options: ElementWiseMultiplication | MultimodalFactorizedBilinearPooling (component: question_image_fusion) + question_image_fusion_size_val: &question_image_fusion_size_val 512 + + # Image size encoder. + image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 + + # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 522 + + # Final classifier: FFN. + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] + + batch_size: &batch_size 64 + +# Training parameters: +training: + problem: + batch_size: *batch_size + categories: all #C1,C2,C3 # TODO: all + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_c4_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_c4_binary_yn.weights.csv + + # Optimizer parameters: + optimizer: + name: Adam + lr: 0.0001 + + # Terminal conditions: + terminal_conditions: + loss_stop: 1.0e-3 + episode_limit: 5000 #10000 + epoch_limit: -1 + +# Validation parameters: +validation: + problem: + batch_size: *batch_size + categories: all #C1,C2,C3 # TODO: all + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, question_attention_output_size, image_size_encoder_input_size, image_size_encoder_output_size, fused_activation_size] + values: [*question_encoder_output_size_val, *question_attention_output_size_val, 2, *image_size_encoder_output_size_val, *question_image_fusion_size_val] + + # Statistics. + batch_size: + priority: 0.1 + type: BatchSizeStatistics + + # Answer encoding. + pipe1_all_answer_indexer: + priority: 0.2 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.all.word.mappings.csv # TODO: all + # Export mappings and size to globals. 
+ export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: answers_ids + globals: + vocabulary_size: vocabulary_size_c1234_binary_yn # TODO: rename + word_mappings: word_mappings_c1234_binary_yn + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: *question_encoder_embeddings_size_val + pretrained_embeddings_file: *question_encoder_embeddings + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + globals: + embeddings_size: pipe1_embeddings_size + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: GRU + hidden_size: *question_encoder_lstm_size_val + prediction_mode: Dense + initial_state: Trainable + use_logsoftmax: False + output_last_state: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: pipe1_embeddings_size + prediction_size: question_encoder_output_size + + + # Model 3: self attention for question. + question_attention: + priority: 1.3 + type: SelfAttention + latent_size: *question_encoder_lstm_size_val + num_attention_heads: 4 + streams: + question_encodings: question_activations + outputs: question_attention_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: question_attention_output_size + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model_type: *image_encoder_model + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + + image_attention: + priority: 2.2 + type: VQA_Attention + dropout_rate: 0.3 + latent_size: 256 + output_mode: 'Image' + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_attention_activations + outputs: image_attention_activations + globals: + question_encoding_size: question_attention_output_size + output_size: image_attention_output_size + + ################# PIPE 3: image-question fusion ################# + # MFB. + question_image_fusion: + priority: 3.1 + type: *question_image_fusion_type + dropout_rate: 0.5 + latent_size: 512 + pool_factor: 2 + streams: + image_encodings: image_attention_activations + question_encodings: question_attention_activations + outputs: fused_activations + globals: + image_encoding_size: image_attention_output_size + question_encoding_size: question_attention_output_size + # output_size: image_attention_output_size #fused_activation_size + + ################# PIPE 4: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. 
+  image_size_encoder:
+    priority: 4.1
+    type: FeedForwardNetwork
+    use_logsoftmax: False
+    streams:
+      inputs: image_sizes
+      predictions: image_size_activations
+    globals:
+      input_size: image_size_encoder_input_size
+      prediction_size: image_size_encoder_output_size
+
+  ################# PIPE 5: image-question-image size fusion #################
+
+  # 5th subpipeline: concatenation
+  concat:
+    priority: 5.1
+    type: Concatenation
+    input_streams: [fused_activations,image_size_activations]
+    # Concatenation
+    dim: 1 # default
+    input_dims: [[-1,*question_image_fusion_size_val],[-1,*image_size_encoder_output_size_val]]
+    output_dims: [-1,*question_image_size_fusion_size_val]
+    streams:
+      outputs: concatenated_activations
+    globals:
+      output_size: concatenated_activations_size
+
+  ################# PIPE 6: C1 + C2 + C3 + C4 questions #################
+
+  # Model 4: FFN C1234 answering
+  pipe6_c123_answer_classifier:
+    priority: 6.3
+    type: FeedForwardNetwork
+    hidden_sizes: *answer_classifier_hidden_sizes_val
+    dropout_rate: 0.5
+    streams:
+      inputs: concatenated_activations
+      predictions: pipe6_c123_predictions
+    globals:
+      input_size: concatenated_activations_size
+      prediction_size: vocabulary_size_c1234_binary_yn
+
+  pipe6_c123_nllloss:
+    priority: 6.4
+    type: NLLLoss
+    targets_dim: 1
+    streams:
+      predictions: pipe6_c123_predictions
+      targets: answers_ids
+      loss: pipe6_c123_loss
+
+  pipe6_c123_precision_recall:
+    priority: 6.5
+    type: PrecisionRecallStatistics
+    use_word_mappings: True
+    show_class_scores: True
+    #show_confusion_matrix: True
+    streams:
+      predictions: pipe6_c123_predictions
+      targets: answers_ids
+    globals:
+      word_mappings: word_mappings_c1234_binary_yn
+    statistics:
+      precision: pipe6_c123_precision
+      recall: pipe6_c123_recall
+      f1score: pipe6_c123_f1score
+
+  # C1234 predictions decoder.
+  pipe6_c123_prediction_decoder:
+    priority: 6.6
+    type: WordDecoder
+    # Use the same word mappings as the label indexer.
+    import_word_mappings_from_globals: True
+    streams:
+      inputs: pipe6_c123_predictions
+      outputs: predicted_answers
+    globals:
+      word_mappings: word_mappings_c1234_binary_yn
+
+  ################# PIPE 7: MERGE ANSWERS #################
+
+  # Viewers.
+  viewer:
+    priority: 7.3
+    type: StreamViewer
+    input_streams:
+      tokenized_questions,
+      category_names, pipe0_predicted_question_categories_names,
+      answers, predicted_answers
diff --git a/ptp/components/models/vqa/attention.py b/ptp/components/models/vqa/attention.py
index 88c868f..4a04b66 100644
--- a/ptp/components/models/vqa/attention.py
+++ b/ptp/components/models/vqa/attention.py
@@ -60,8 +60,17 @@ def __init__(self, name, config):
         self.latent_size = self.config["latent_size"]
         self.num_attention_heads = self.config["num_attention_heads"]
 
+        # Output the attention-weighted image encoding alone, or its concatenation with the question encoding.
+        self.output_mode = self.config["output_mode"]
+
         # Output feature size
-        self.output_size = self.feature_maps_depth*self.num_attention_heads + self.question_encoding_size
+        if(self.output_mode == 'Image'):
+            self.output_size = self.feature_maps_depth*self.num_attention_heads
+        elif(self.output_mode == 'Fusion'):
+            self.output_size = self.feature_maps_depth*self.num_attention_heads + self.question_encoding_size
+        else:
+            raise ValueError("Invalid 'output_mode' ({}) in VQA_Attention configuration: expected 'Image' or 'Fusion'".format(self.output_mode))
+
         # Export to globals.
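+        # (With resnet50 feature maps of depth 2048 and num_attention_heads = 2 this gives
+        #  4096 for 'Image' and 4096 + question_encoding_size for 'Fusion'.)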
         self.globals["output_size"] = self.output_size
 
@@ -137,8 +146,12 @@ def forward(self, data_dict):
         attention_enc_img = apply_attention(enc_img, attention) # [48, 2048, 7, 7], [48, 2, 7, 7]
         # print("attention im", attention_enc_img.shape)
-        # Fusion -- Concatenate attention-weighted image encodings and question encodings.
-        outputs = torch.cat([attention_enc_img, latent_q], dim=1)
+        if(self.output_mode == 'Image'):
+            # Output attention-weighted image encodings only.
+            outputs = attention_enc_img
+        elif(self.output_mode == 'Fusion'):
+            # Fusion -- Concatenate attention-weighted image encodings and question encodings.
+            outputs = torch.cat([attention_enc_img, latent_q], dim=1)
         # print("outputs", outputs.shape)
 
         # Add predictions to datadict.
         data_dict.extend({self.key_outputs: outputs})
@@ -165,4 +178,4 @@ def apply_attention(input, attention):
     attention = torch.nn.functional.softmax(attention, dim=-1).unsqueeze(2) # [n, g, 1, s] [batch, multi_head, 1, height*width] [48, 2, 1, 7*7]
     weighted = attention * input # [n, g, c, s] [48, 2, 2048, 7*7]
     weighted_mean = weighted.sum(dim=-1) # [n, g, c] [48, 2, 2048]
-    return weighted_mean.view(n, -1) # [48, 4196]
+    return weighted_mean.view(n, -1) # [48, 4096]
diff --git a/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py
index f09b374..e9fcc33 100644
--- a/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py
+++ b/ptp/components/models/vqa/multimodal_factorized_bilinear_pooling.py
@@ -59,6 +59,9 @@ def __init__(self, name, config):
         # Output feature size
         self.output_size = self.latent_size
 
+        # Export to globals.
+        self.globals["output_size"] = self.output_size
+
         # Map image and question encodings to a common latent space of dimension 'latent_size'.
         self.image_encodings_ff = torch.nn.Linear(self.image_encoding_size, self.latent_size*self.factor)
         self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.latent_size*self.factor)
@@ -113,6 +116,7 @@ def forward(self, data_dict):
         # Element-wise mutliplication of image and question encodings
         enc_z = latent_img * latent_q # [48, 512]
 
+        # Dropout regularization
         enc_z = self.dropout(enc_z)
         enc_z = enc_z.view(enc_z.size(0), self.latent_size, self.factor) # [48, 256, 2]
diff --git a/ptp/components/models/vqa/self_attention.py b/ptp/components/models/vqa/self_attention.py
index 01adc18..080f9a9 100644
--- a/ptp/components/models/vqa/self_attention.py
+++ b/ptp/components/models/vqa/self_attention.py
@@ -104,14 +104,12 @@ def forward(self, data_dict):
         self.Attention = torch.softmax(self.W2(self.activation(self.W1(input_enc))), dim = 1) # [48, 8, 4] [batch, num_words, num_heads]
 
         # Multiply attention weights with question encoding
-        input_enc_weighted = torch.matmul(self.Attention.transpose(1,2),input_enc)
-        print("input_enc_weighted", input_enc_weighted.shape)
+        input_enc_weighted = torch.matmul(self.Attention.transpose(1,2),input_enc) # [48, 4, 100] [batch, num_heads, embed_dim]
 
         # Concatenate features from multi-head attention
-        outputs = input_enc_weighted.view(batch_size, -1)
+        outputs = input_enc_weighted.view(batch_size, -1) # [48, 400] [batch, num_heads*embed_dim]
 
         # # Alternatively: combine multi-head attention using a mean or sum operation
         # outputs = torch.sum(input_enc_weighted,1)/self.num_attention_heads
-        print("outputs", outputs.shape)
 
         # Add predictions to datadict.
         data_dict.extend({self.key_outputs: outputs})
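For reference, two standalone PyTorch sketches (illustration only, not part of the patch) of the shape arithmetic behind the new components. The sizes are taken from the configs above (resnet50 feature maps of depth 2048, 2 image attention heads, 800-dimensional question attention output, latent_size 512 with pool_factor 2); everything else is assumed for the example.

# Sketch 1 -- the two 'output_mode' options of VQA_Attention (mirrors apply_attention()).
import torch

batch, heads, depth, spatial = 48, 2, 2048, 7 * 7
question_encoding_size = 800

feature_maps = torch.randn(batch, depth, spatial)                   # [n, c, s]
attention_logits = torch.randn(batch, heads, spatial)               # [n, g, s]

attention = torch.softmax(attention_logits, dim=-1).unsqueeze(2)    # [n, g, 1, s]
weighted = attention * feature_maps.unsqueeze(1)                    # [n, g, c, s]
attention_enc_img = weighted.sum(dim=-1).view(batch, -1)            # [n, g*c] = [48, 4096]

latent_q = torch.randn(batch, question_encoding_size)
image_out = attention_enc_img                                       # output_mode: Image  -> [48, 4096]
fusion_out = torch.cat([attention_enc_img, latent_q], dim=1)        # output_mode: Fusion -> [48, 4896]

assert image_out.shape == (batch, heads * depth)
assert fusion_out.shape == (batch, heads * depth + question_encoding_size)

# Sketch 2 -- the factorized bilinear pooling step. The projections and element-wise fusion
# follow the hunk above; the final sum-pooling over the factor dimension is the usual MFB
# step and is assumed here, since that part of forward() is not shown in the diff.
image_encoding_size = 4096            # image_attention_activation_size
latent_size, factor = 512, 2          # 'latent_size' and 'pool_factor' in the configs

image_ff = torch.nn.Linear(image_encoding_size, latent_size * factor)
question_ff = torch.nn.Linear(question_encoding_size, latent_size * factor)

img = torch.randn(batch, image_encoding_size)
q = torch.randn(batch, question_encoding_size)

enc_z = image_ff(img) * question_ff(q)            # [48, 1024]
enc_z = enc_z.view(batch, latent_size, factor)    # [48, 512, 2]
pooled = enc_z.sum(dim=-1)                        # [48, 512] == exported output_size
assert pooled.shape == (batch, latent_size)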