From 078d5c0b054ab46fe6706ccf3e9614de531fa6db Mon Sep 17 00:00:00 2001 From: tkornut Date: Fri, 19 Apr 2019 16:30:28 -0700 Subject: [PATCH 01/11] c2: 100000 --- .../c2_classification_all_rnn_vgg16_concat.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index d4745b6..0999e73 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -1,6 +1,14 @@ # Load config defining problems for training, validation and testing. default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml +training: + # settings parameters + terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 100000 + epoch_limit: -1 + + pipeline: name: vqa_med_c2_classification_all_rnn_vgg_concat From 71b7ff972fc162a39448877af314bc526e81f0dd Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 16:59:09 -0700 Subject: [PATCH 02/11] element wise multiplication component + pipeline for c2 --- .../models/element_wise_multiplication.yml | 46 +++++++ .../c2_classification_all_rnn_vgg16_ewm.yml | 98 ++++++++++++++ ptp/components/models/__init__.py | 2 + .../models/element_wise_multiplication.py | 122 ++++++++++++++++++ 4 files changed, 268 insertions(+) create mode 100644 configs/default/components/models/element_wise_multiplication.yml create mode 100644 configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml create mode 100644 ptp/components/models/element_wise_multiplication.py diff --git a/configs/default/components/models/element_wise_multiplication.yml b/configs/default/components/models/element_wise_multiplication.yml new file mode 100644 index 0000000..f0f02d8 --- /dev/null +++ b/configs/default/components/models/element_wise_multiplication.yml @@ -0,0 +1,46 @@ +# This file defines the default values for the ElementWiseMultiplication model. + +#################################################################### +# 1. CONFIGURATION PARAMETERS that will be LOADED by the component. +#################################################################### + +# Dropout rate (LOADED) +# Default: 0 (means that it is turned off) +dropout_rate: 0 + +streams: + #################################################################### + # 2. Keymappings associated with INPUT and OUTPUT streams. + #################################################################### + + # Stream containing batch of encoded images (INPUT) + image_encodings: image_encodings + + # Stream containing batch of encoded questions (INPUT) + question_encodings: question_encodings + + # Stream containing outputs (OUTPUT) + outputs: outputs + +globals: + #################################################################### + # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. + #################################################################### + + # Size of the image encodings input (RETRIEVED) + image_encoding_size: image_encoding_size + + # Size of the question encodings input (RETRIEVED) + question_encoding_size: question_encoding_size + + # Size of the output (RETRIEVED) + output_size: output_size + + #################################################################### + # 4. Keymappings associated with GLOBAL variables that will be SET. 
+ #################################################################### + + #################################################################### + # 5. Keymappings associated with statistics that will be ADDED. + #################################################################### + diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml new file mode 100644 index 0000000..8614f78 --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -0,0 +1,98 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + # settings parameters + terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 10000 + epoch_limit: -1 + + +pipeline: + name: c2_classification_all_rnn_vgg16_ewm + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size] + values: [100, 100, 100] + + # First subpipeline: question. + # Questions encoding. + question_tokenizer: + priority: 1.1 + type: SentenceTokenizer + streams: + inputs: questions + outputs: tokenized_questions + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state_trainable: False + #num_layers: 5 + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + # 3rd subpipeline: image. + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + streams: + inputs: images + predictions: image_activations + globals: + prediction_size: image_encoder_output_size + + # 4th subpipeline: element wise multiplication + FF. 
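For orientation, the question branch configured above reduces to the following shape flow (a rough sketch with stand-in torch modules and a hypothetical vocabulary size; the real SentenceTokenizer / SentenceEmbeddings / RecurrentNeuralNetwork components handle word mappings, padding and log-softmax themselves):

    import torch

    vocab_size, batch, seq_len = 10000, 64, 16              # hypothetical values
    embed = torch.nn.Embedding(vocab_size, 50)               # embeddings_size: 50 (glove.6B.50d)
    lstm = torch.nn.LSTM(input_size=50, hidden_size=50, batch_first=True)
    head = torch.nn.Linear(50, 100)                          # prediction_size -> question_encoder_output_size

    tokens = torch.randint(0, vocab_size, (batch, seq_len))  # tokenized, index-mapped questions
    outputs, _ = lstm(embed(tokens))                         # [64 x 16 x 50]
    question_activations = head(outputs[:, -1])              # prediction_mode: Last -> [64 x 100]

The image branch (TorchVisionWrapper) produces image_activations of the matching size (here 100), so the two streams can be fused element-wise in the step configured next.
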
+ question_image_fusion: + priority: 4.1 + type: ElementWiseMultiplication + dropout: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: element_wise_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: element_wise_activation_size + + + classifier: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout: 0.5 + streams: + inputs: element_wise_activations + globals: + input_size: element_wise_activation_size + prediction_size: vocabulary_size_c2 + + + #: pipeline diff --git a/ptp/components/models/__init__.py b/ptp/components/models/__init__.py index 32e95b0..97ab98b 100644 --- a/ptp/components/models/__init__.py +++ b/ptp/components/models/__init__.py @@ -1,4 +1,5 @@ from .convnet_encoder import ConvNetEncoder +from .element_wise_multiplication import ElementWiseMultiplication from .feed_forward_network import FeedForwardNetwork from .index_embeddings import IndexEmbeddings from .torch_vision_wrapper import TorchVisionWrapper @@ -9,6 +10,7 @@ __all__ = [ 'ConvNetEncoder', + 'ElementWiseMultiplication', 'FeedForwardNetwork', 'IndexEmbeddings', 'TorchVisionWrapper', diff --git a/ptp/components/models/element_wise_multiplication.py b/ptp/components/models/element_wise_multiplication.py new file mode 100644 index 0000000..a5c88c8 --- /dev/null +++ b/ptp/components/models/element_wise_multiplication.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (C) IBM Corporation 2018 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Tomasz Kornuta" + + +import torch + +from ptp.components.models.model import Model +from ptp.data_types.data_definition import DataDefinition + + +class ElementWiseMultiplication(Model): + """ + Element of one of classical baselines for Visual Question Answering. + The model inputs (question and image encodings) are fused via element-wise multiplication and returned (for subsequent classification, done in a separate component e.g. ffn). + + On the basis of: Jiasen Lu and Xiao Lin and Dhruv Batra and Devi Parikh. "Deeper LSTM and normalized CNN visual question answering model" (2015). + """ + def __init__(self, name, config): + """ + Initializes the model, creates the required layers. + + :param name: Name of the model (taken from the configuration file). + + :param config: Parameters read from configuration file. + :type config: ``ptp.configuration.ConfigInterface`` + + """ + super(ElementWiseMultiplication, self).__init__(name, ElementWiseMultiplication, config) + + # Get key mappings. + self.key_image_encodings = self.stream_keys["image_encodings"] + self.key_question_encodings = self.stream_keys["question_encodings"] + self.key_outputs = self.stream_keys["outputs"] + + # Retrieve input/output sizes from globals. 
+ self.image_encoding_size = self.globals["image_encoding_size"] + self.question_encoding_size = self.globals["question_encoding_size"] + self.output_size = self.globals["output_size"] + + # Create the model. + self.image_encodings_ff = torch.nn.Linear(self.image_encoding_size, self.output_size) + self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.output_size) + + # Create activation layer. + self.activation = torch.nn.ReLU() + + # Retrieve dropout rate value - if set, will put dropout between every layer. + dropout_rate = self.config["dropout_rate"] + + # Create dropout layer. + self.dropout = torch.nn.Dropout(dropout_rate) + + + + + def input_data_definitions(self): + """ + Function returns a dictionary with definitions of input data that are required by the component. + + :return: dictionary containing input data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_image_encodings: DataDefinition([-1, self.image_encoding_size], [torch.Tensor], "Batch of encoded images [BATCH_SIZE x IMAGE_ENCODING_SIZE]"), + self.key_question_encodings: DataDefinition([-1, self.question_encoding_size], [torch.Tensor], "Batch of encoded questions [BATCH_SIZE x QUESTION_ENCODING_SIZE]"), + } + + + def output_data_definitions(self): + """ + Function returns a dictionary with definitions of output data produced the component. + + :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_outputs: DataDefinition([-1, self.output_size], [torch.Tensor], "Batch of outputs [BATCH_SIZE x OUTPUT_SIZE]") + } + + def forward(self, data_dict): + """ + Main forward pass of the model. + + :param data_dict: DataDict({'images',**}) + :type data_dict: ``ptp.dadatypes.DataDict`` + """ + + # Unpack DataDict. + enc_img = data_dict[self.key_image_encodings] + enc_q = data_dict[self.key_question_encodings] + + # Apply nonlinearities and dropout on images. + enc_img = self.activation(enc_img) + enc_img = self.dropout(enc_img) + + # Apply nonlinearities and dropout on questions. + enc_q = self.activation(enc_q) + enc_q = self.dropout(enc_q) + + # Pass inputs layers mapping them to the same "latent space". + latent_img = self.image_encodings_ff(enc_img) + latent_q = self.question_encodings_ff(enc_q) + + # Element wise multiplication. + outputs = latent_img * latent_q + + # Add predictions to datadict. 
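Putting this forward pass together, the fusion amounts to the following minimal sketch (illustrative batch size and the 100-unit sizes used by the c2 configs; the component itself reads all sizes from globals):

    import torch

    B, D = 64, 100                                    # batch size, shared latent size (illustrative)
    image_ff = torch.nn.Linear(D, D)                  # image_encodings_ff
    question_ff = torch.nn.Linear(D, D)               # question_encodings_ff
    act, drop = torch.nn.ReLU(), torch.nn.Dropout(0.5)

    enc_img, enc_q = torch.randn(B, D), torch.randn(B, D)
    latent_img = image_ff(drop(act(enc_img)))         # project image encodings into the latent space
    latent_q = question_ff(drop(act(enc_q)))          # project question encodings into the same space
    outputs = latent_img * latent_q                   # element-wise (Hadamard) fusion -> [B x D]
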
+ data_dict.extend({self.key_outputs: outputs}) From 0680805f5691ee2a2fa831ded1137debabd1bb1c Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 17:14:26 -0700 Subject: [PATCH 03/11] c2: pipeline with question-image element-wise multiplication and late fusion concat with image-size --- .../c2_classification_all_rnn_vgg16_ewm.yml | 10 +- ..._classification_all_rnn_vgg16_ewm_size.yml | 137 ++++++++++++++++++ 2 files changed, 141 insertions(+), 6 deletions(-) create mode 100644 configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index 8614f78..1fa84d0 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -19,7 +19,7 @@ pipeline: keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size] values: [100, 100, 100] - # First subpipeline: question. + ################# PIPE 0: question ################# # Questions encoding. question_tokenizer: priority: 1.1 @@ -48,7 +48,6 @@ pipeline: prediction_mode: Last use_logsoftmax: False initial_state_trainable: False - #num_layers: 5 hidden_size: 50 streams: inputs: embedded_questions @@ -57,7 +56,7 @@ pipeline: input_size: embeddings_size prediction_size: question_encoder_output_size - # 3rd subpipeline: image. + ################# PIPE 2: image ################# # Image encoder. image_encoder: priority: 3.1 @@ -68,7 +67,8 @@ pipeline: globals: prediction_size: image_encoder_output_size - # 4th subpipeline: element wise multiplication + FF. + ################# PIPE 3: fusion + classification ################# + # Element wise multiplication + FF. question_image_fusion: priority: 4.1 type: ElementWiseMultiplication @@ -82,7 +82,6 @@ pipeline: question_encoding_size: question_encoder_output_size output_size: element_wise_activation_size - classifier: priority: 4.2 type: FeedForwardNetwork @@ -94,5 +93,4 @@ pipeline: input_size: element_wise_activation_size prediction_size: vocabulary_size_c2 - #: pipeline diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml new file mode 100644 index 0000000..ab908a5 --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -0,0 +1,137 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + # settings parameters + terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 10000 + epoch_limit: -1 + + +pipeline: + name: c2_classification_all_rnn_vgg16_ewm_size + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size,image_size_encoder_input_size, image_size_encoder_output_size] + values: [100, 100, 100, 2, 10] + + ################# PIPE 0: question ################# + # Questions encoding. 
+ question_tokenizer: + priority: 1.1 + type: SentenceTokenizer + streams: + inputs: questions + outputs: tokenized_questions + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state_trainable: False + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + streams: + inputs: images + predictions: image_activations + globals: + prediction_size: image_encoder_output_size + + ################# PIPE 3: image-question fusion ################# + # Element wise multiplication + FF. + question_image_fusion: + priority: 4.1 + type: ElementWiseMultiplication + dropout: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: element_wise_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: element_wise_activation_size + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout: 0.5 + streams: + inputs: element_wise_activations + predictions: question_image_activations + globals: + input_size: element_wise_activation_size + prediction_size: element_wise_activation_size + + ################# PIPE 4: image-question-image size fusion + classification ################# + # 2nd subpipeline: image size. + # Model - image size classifier. + image_size_encoder: + priority: 5.1 + type: FeedForwardNetwork + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + # 4th subpipeline: concatenation + FF. 
+ concat: + priority: 5.2 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,10]] + output_dims: [-1,110] + streams: + outputs: concatenated_activations + globals: + output_size: concatentated_activations_size + + + classifier: + priority: 5.3 + type: FeedForwardNetwork + hidden_sizes: [110] + dropout: 0.5 + streams: + inputs: concatenated_activations + globals: + input_size: concatentated_activations_size + prediction_size: vocabulary_size_c2 + + + #: pipeline From 9a4e958d02ca480a1820e3bf39f3597a01591eaa Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 17:30:32 -0700 Subject: [PATCH 04/11] c2: 128 batch size --- .../c2_classification_all_rnn_vgg16_ewm_size.yml | 8 -------- .../c2_classification/default_c2_classification.yml | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml index ab908a5..e867c17 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -1,14 +1,6 @@ # Load config defining problems for training, validation and testing. default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml -training: - # settings parameters - terminal_conditions: - loss_stop: 1.0e-2 - episode_limit: 10000 - epoch_limit: -1 - - pipeline: name: c2_classification_all_rnn_vgg16_ewm_size diff --git a/configs/vqa_med_2019/c2_classification/default_c2_classification.yml b/configs/vqa_med_2019/c2_classification/default_c2_classification.yml index 3df45b4..f88e328 100644 --- a/configs/vqa_med_2019/c2_classification/default_c2_classification.yml +++ b/configs/vqa_med_2019/c2_classification/default_c2_classification.yml @@ -4,16 +4,24 @@ default_configs: vqa_med_2019/default_vqa_med_2019.yml # Training parameters: training: problem: + batch_size: 128 categories: C2 sampler: name: WeightedRandomSampler weights: ~/data/vqa-med/answers.c2.weights.csv dataloader: num_workers: 4 + # Termination. 
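The late-fusion block configured above (concat + classifier) boils down to a concatenation along the feature dimension; in tensor terms (illustrative batch size, feature sizes taken from the config):

    import torch

    question_image_activations = torch.randn(64, 100)   # output of the EWM + FF stage
    image_size_activations = torch.randn(64, 10)         # output of the image-size encoder
    concatenated_activations = torch.cat(
        [question_image_activations, image_size_activations], dim=1)   # [64 x 110]
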
+ terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 10000 + epoch_limit: -1 + # Validation parameters: validation: problem: + batch_size: 128 categories: C2 dataloader: num_workers: 4 From 957ccf7ec7baf4d7b5ba061a14ae465d98ec9eb1 Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 18:55:38 -0700 Subject: [PATCH 05/11] Refactored Wrapper, added option to return feature maps (VGG16 only) --- .../models/torch_vision_wrapper.yml | 27 ++++++++-- ...c1_classification_all_bow_vgg16_concat.yml | 4 +- ...c1_classification_all_rnn_vgg16_concat.yml | 4 +- ...c2_classification_all_rnn_vgg16_concat.yml | 4 +- .../c2_classification_all_rnn_vgg16_ewm.yml | 4 +- ..._classification_all_rnn_vgg16_ewm_size.yml | 4 +- ...c3_classification_all_bow_vgg16_concat.yml | 4 +- ...c3_classification_all_rnn_vgg16_concat.yml | 4 +- .../c3_classification_image_vgg16_softmax.yml | 4 +- ...nn_shared_all_encoders_two_ffns_losses.yml | 4 +- ...t_rnn_shared_all_encoders_one_ffn_loss.yml | 4 +- ...n_shared_all_encoders_four_ffns_losses.yml | 4 +- ..._shared_all_encoders_three_ffns_losses.yml | 4 +- ptp/components/models/convnet_encoder.py | 2 +- ptp/components/models/torch_vision_wrapper.py | 50 ++++++++++++++----- 15 files changed, 84 insertions(+), 43 deletions(-) diff --git a/configs/default/components/models/torch_vision_wrapper.yml b/configs/default/components/models/torch_vision_wrapper.yml index 28db8d8..968eb9c 100644 --- a/configs/default/components/models/torch_vision_wrapper.yml +++ b/configs/default/components/models/torch_vision_wrapper.yml @@ -1,4 +1,4 @@ -# This file defines the default values for the LeNet5 model. +# This file defines the default values for the component wrapping (pretrained) Torch Vision models. #################################################################### # 1. CONFIGURATION PARAMETERS that will be LOADED by the component. @@ -8,6 +8,10 @@ #model_type: VGG16 # HARDCODED FOR NOW! +# Parameter denoting whether the component will return (flat) prediction +# or output of last feature layer (LOADED) +return_feature_maps: False + streams: #################################################################### # 2. Keymappings associated with INPUT and OUTPUT streams. @@ -16,21 +20,34 @@ streams: # Stream containing batch of images (INPUT) inputs: inputs - # Stream containing predictions (OUTPUT) - predictions: predictions + # Stream containing outputs (features or "predictions") (OUTPUT) + outputs: outputs globals: #################################################################### # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. #################################################################### - # Size of the prediction (RETRIEVED) - prediction_size: prediction_size + # Size of the output (RETRIEVED) + # Used when return_features = False. + output_size: output_size #################################################################### # 4. Keymappings associated with GLOBAL variables that will be SET. #################################################################### + # Height of the returned features tensor (SET) + # Used when return_features = True. + feature_maps_height: feature_maps_height + + # Width of the returned features tensor (SET) + # Used when return_features = True. + feature_maps_width: feature_maps_width + + # Depth of the returned features tensor (SET) + # Used when return_features = True. + feature_maps_depth: feature_maps_depth + #################################################################### # 5. 
Keymappings associated with statistics that will be ADDED. #################################################################### diff --git a/configs/vqa_med_2019/c1_classification/c1_classification_all_bow_vgg16_concat.yml b/configs/vqa_med_2019/c1_classification/c1_classification_all_bow_vgg16_concat.yml index b2e6ce1..a38067e 100644 --- a/configs/vqa_med_2019/c1_classification/c1_classification_all_bow_vgg16_concat.yml +++ b/configs/vqa_med_2019/c1_classification/c1_classification_all_bow_vgg16_concat.yml @@ -61,9 +61,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c1_classification/c1_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c1_classification/c1_classification_all_rnn_vgg16_concat.yml index 62b4389..f8942cc 100644 --- a/configs/vqa_med_2019/c1_classification/c1_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c1_classification/c1_classification_all_rnn_vgg16_concat.yml @@ -68,9 +68,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index 0999e73..fa83133 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -76,9 +76,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index 1fa84d0..af37b20 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -63,9 +63,9 @@ pipeline: type: TorchVisionWrapper streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: fusion + classification ################# # Element wise multiplication + FF. diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml index e867c17..47a780c 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -55,9 +55,9 @@ pipeline: type: TorchVisionWrapper streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: image-question fusion ################# # Element wise multiplication + FF. 
diff --git a/configs/vqa_med_2019/c3_classification/c3_classification_all_bow_vgg16_concat.yml b/configs/vqa_med_2019/c3_classification/c3_classification_all_bow_vgg16_concat.yml index ed3ed6a..0a02d41 100644 --- a/configs/vqa_med_2019/c3_classification/c3_classification_all_bow_vgg16_concat.yml +++ b/configs/vqa_med_2019/c3_classification/c3_classification_all_bow_vgg16_concat.yml @@ -61,9 +61,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c3_classification/c3_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c3_classification/c3_classification_all_rnn_vgg16_concat.yml index 51b30c6..3a634fb 100644 --- a/configs/vqa_med_2019/c3_classification/c3_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c3_classification/c3_classification_all_rnn_vgg16_concat.yml @@ -68,9 +68,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c3_classification/c3_classification_image_vgg16_softmax.yml b/configs/vqa_med_2019/c3_classification/c3_classification_image_vgg16_softmax.yml index 9bc9412..1882aca 100644 --- a/configs/vqa_med_2019/c3_classification/c3_classification_image_vgg16_softmax.yml +++ b/configs/vqa_med_2019/c3_classification/c3_classification_image_vgg16_softmax.yml @@ -10,9 +10,9 @@ pipeline: priority: 1.1 streams: inputs: images - predictions: vgg_images + outputs: vgg_images globals: - prediction_size: vocabulary_size_c3 + output_size: vocabulary_size_c3 # Model - softmax classifier. 
classifier: diff --git a/configs/vqa_med_2019/vf/c1_binary_vf_cat_rnn_shared_all_encoders_two_ffns_losses.yml b/configs/vqa_med_2019/vf/c1_binary_vf_cat_rnn_shared_all_encoders_two_ffns_losses.yml index 6996f91..1d31f30 100644 --- a/configs/vqa_med_2019/vf/c1_binary_vf_cat_rnn_shared_all_encoders_two_ffns_losses.yml +++ b/configs/vqa_med_2019/vf/c1_binary_vf_cat_rnn_shared_all_encoders_two_ffns_losses.yml @@ -182,9 +182,9 @@ pipeline: priority: 2.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# diff --git a/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml b/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml index ef8f535..4a5a179 100644 --- a/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml +++ b/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml @@ -182,9 +182,9 @@ pipeline: priority: 2.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# diff --git a/configs/vqa_med_2019/vf/c1_c2_c3_binary_vf_cat_rnn_shared_all_encoders_four_ffns_losses.yml b/configs/vqa_med_2019/vf/c1_c2_c3_binary_vf_cat_rnn_shared_all_encoders_four_ffns_losses.yml index 56ab04b..8b7b50a 100644 --- a/configs/vqa_med_2019/vf/c1_c2_c3_binary_vf_cat_rnn_shared_all_encoders_four_ffns_losses.yml +++ b/configs/vqa_med_2019/vf/c1_c2_c3_binary_vf_cat_rnn_shared_all_encoders_four_ffns_losses.yml @@ -182,9 +182,9 @@ pipeline: priority: 2.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# diff --git a/configs/vqa_med_2019/vf/c1_c3_binary_vf_cat_rnn_shared_all_encoders_three_ffns_losses.yml b/configs/vqa_med_2019/vf/c1_c3_binary_vf_cat_rnn_shared_all_encoders_three_ffns_losses.yml index 3b1d952..105f2e3 100644 --- a/configs/vqa_med_2019/vf/c1_c3_binary_vf_cat_rnn_shared_all_encoders_three_ffns_losses.yml +++ b/configs/vqa_med_2019/vf/c1_c3_binary_vf_cat_rnn_shared_all_encoders_three_ffns_losses.yml @@ -182,9 +182,9 @@ pipeline: priority: 2.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# diff --git a/ptp/components/models/convnet_encoder.py b/ptp/components/models/convnet_encoder.py index 288f0cd..788ef5f 100644 --- a/ptp/components/models/convnet_encoder.py +++ b/ptp/components/models/convnet_encoder.py @@ -224,7 +224,7 @@ def output_data_definitions(self): :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). 
""" return { - self.key_feature_maps: DataDefinition([-1, self.out_channels_conv3, self.height_features_maxpool3, self.width_features_maxpool3], [torch.Tensor], "Batch of filter maps [BATCH_SIZE x DEPTH x HEIGHT x WIDTH]") + self.key_feature_maps: DataDefinition([-1, self.out_channels_conv3, self.height_features_maxpool3, self.width_features_maxpool3], [torch.Tensor], "Batch of filter maps [BATCH_SIZE x FEAT_DEPTH x FEAT_HEIGHT x FEAT_WIDTH]") } def forward(self, data_dict): diff --git a/ptp/components/models/torch_vision_wrapper.py b/ptp/components/models/torch_vision_wrapper.py index 13692f1..4c5027c 100644 --- a/ptp/components/models/torch_vision_wrapper.py +++ b/ptp/components/models/torch_vision_wrapper.py @@ -43,15 +43,34 @@ def __init__(self, name, config): # Get key mappings. self.key_inputs = self.stream_keys["inputs"] - self.key_predictions = self.stream_keys["predictions"] - - # Retrieve prediction size from globals. - self.prediction_size = self.globals["prediction_size"] + self.key_outputs = self.stream_keys["outputs"] # Get VGG16 self.model = models.vgg16(pretrained=True) - # "Replace" last layer. - self.model.classifier._modules['6'] = torch.nn.Linear(4096, self.prediction_size) + + # Check operation mode. + self.return_feature_maps = self.config["return_feature_maps"] + + if self.return_feature_maps: + # Use only the "feature encoder". + self.model = self.model.features + + # Height of the returned features tensor (SET) + self.feature_maps_height = 7 + self.globals["feature_maps_height"] = self.feature_maps_height + # Width of the returned features tensor (SET) + self.feature_maps_width = 7 + self.globals["feature_maps_width"] = self.feature_maps_width + # Depth of the returned features tensor (SET) + self.feature_maps_depth = 512 + self.globals["feature_maps_depth"] = self.feature_maps_depth + + else: + # Use the whole model, but cut/reshape only the last layer. + # Retrieve prediction size from globals. + self.output_size = self.globals["output_size"] + # "Replace" last layer. + self.model.classifier._modules['6'] = torch.nn.Linear(4096, self.output_size) def input_data_definitions(self): @@ -71,9 +90,14 @@ def output_data_definitions(self): :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). """ - return { - self.key_predictions: DataDefinition([-1, self.prediction_size], [torch.Tensor], "Batch of predictions, each represented as probability distribution over classes [BATCH_SIZE x PREDICTION_SIZE]") - } + if self.return_feature_maps: + return { + self.key_outputs: DataDefinition([-1, self.feature_maps_depth, self.feature_maps_height, self.feature_maps_width], [torch.Tensor], "Batch of feature maps [BATCH_SIZE x FEAT_DEPTH x FEAT_HEIGHT x FEAT_WIDTH]") + } + else: + return { + self.key_outputs: DataDefinition([-1, self.output_size], [torch.Tensor], "Batch of outputs, each represented as probability distribution over classes [BATCH_SIZE x PREDICTION_SIZE]") + } def forward(self, data_dict): """ @@ -82,7 +106,7 @@ def forward(self, data_dict): :param data_dict: DataDict({'inputs', ....}), where: - inputs: expected stream containing images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE WIDTH] - - outpus: added stream containing predictions [BATCH_SIZE x PREDICTION_SIZE] + - outpus: added stream containing outputs [BATCH_SIZE x PREDICTION_SIZE] :type data_dict: ``ptp.data_types.DataDict`` @@ -91,7 +115,7 @@ def forward(self, data_dict): # Unpack DataDict. 
img = data_dict[self.key_inputs] - predictions = self.model(img) + outputs = self.model(img) - # Add predictions to datadict. - data_dict.extend({self.key_predictions: predictions}) + # Add outputs to datadict. + data_dict.extend({self.key_outputs: outputs}) From 3a8f95a34adc0b60b25260e4ec073e6e1d2844ee Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 19:22:26 -0700 Subject: [PATCH 06/11] c2: dropout_rate: 0.5 --- .../c2_classification_all_rnn_vgg16_concat.yml | 13 +++++++------ .../c2_classification_all_rnn_vgg16_ewm.yml | 4 ++-- .../c2_classification_all_rnn_vgg16_ewm_size.yml | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index fa83133..672a638 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -13,8 +13,8 @@ pipeline: name: vqa_med_c2_classification_all_rnn_vgg_concat global_publisher: - type: GlobalVariablePublisher priority: 0 + type: GlobalVariablePublisher # Add input_size to globals. keys: [question_embeddings_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size] values: [100, 2, 10, 100] @@ -30,8 +30,8 @@ pipeline: # Model 1: Embeddings question_embeddings: - type: SentenceEmbeddings priority: 1.2 + type: SentenceEmbeddings embeddings_size: 50 pretrained_embeddings_file: glove.6B.50d.txt data_folder: ~/data/vqa-med @@ -42,10 +42,10 @@ pipeline: # Model 2: RNN question_lstm: + priority: 1.3 type: RecurrentNeuralNetwork cell_type: LSTM prediction_mode: Last - priority: 1.3 use_logsoftmax: False initial_state_trainable: False #num_layers: 5 @@ -72,8 +72,8 @@ pipeline: # 3rd subpipeline: image. # Image encoder. image_encoder: - type: TorchVisionWrapper priority: 3.1 + type: TorchVisionWrapper streams: inputs: images outputs: image_activations @@ -82,8 +82,8 @@ pipeline: # 4th subpipeline: concatenation + FF. 
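For reference, the two operating modes of the refactored TorchVisionWrapper boil down to the following torchvision usage (a sketch; the 512 x 7 x 7 feature-map shape assumes the standard 224 x 224 input resolution):

    import torch
    from torchvision import models

    vgg = models.vgg16(pretrained=True)

    # return_feature_maps: True  -- keep only the convolutional trunk.
    feature_extractor = vgg.features                 # [B x 512 x 7 x 7] for 224x224 inputs

    # return_feature_maps: False -- keep the classifier, resize its last layer.
    vgg.classifier[6] = torch.nn.Linear(4096, 100)   # output_size retrieved from globals (e.g. 100)

    x = torch.randn(2, 3, 224, 224)
    print(feature_extractor(x).shape)                # torch.Size([2, 512, 7, 7])
    print(vgg(x).shape)                              # torch.Size([2, 100])
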
concat: - type: Concatenation priority: 4.1 + type: Concatenation input_streams: [question_activations,image_size_activations,image_activations] # Concatenation dim: 1 # default @@ -96,9 +96,10 @@ pipeline: classifier: + priority: 4.2 type: FeedForwardNetwork hidden_sizes: [100] - priority: 4.2 + dropout_rate: 0.5 streams: inputs: concatenated_activations globals: diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index af37b20..4208a7d 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -72,7 +72,7 @@ pipeline: question_image_fusion: priority: 4.1 type: ElementWiseMultiplication - dropout: 0.5 + dropout_rate: 0.5 streams: image_encodings: image_activations question_encodings: question_activations @@ -86,7 +86,7 @@ pipeline: priority: 4.2 type: FeedForwardNetwork hidden_sizes: [100] - dropout: 0.5 + dropout_rate: 0.5 streams: inputs: element_wise_activations globals: diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml index 47a780c..91ff5d1 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -64,7 +64,7 @@ pipeline: question_image_fusion: priority: 4.1 type: ElementWiseMultiplication - dropout: 0.5 + dropout_rate: 0.5 streams: image_encodings: image_activations question_encodings: question_activations @@ -78,7 +78,7 @@ pipeline: priority: 4.2 type: FeedForwardNetwork hidden_sizes: [100] - dropout: 0.5 + dropout_rate: 0.5 streams: inputs: element_wise_activations predictions: question_image_activations @@ -118,7 +118,7 @@ pipeline: priority: 5.3 type: FeedForwardNetwork hidden_sizes: [110] - dropout: 0.5 + dropout_rate: 0.5 streams: inputs: concatenated_activations globals: From 5d81c5a06704c8423e4da2fc6629d0c09c3ce205 Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:41:15 -0700 Subject: [PATCH 07/11] Multimodal Compact Bilinear Pooling + pipeline --- .../{ => vqa}/element_wise_multiplication.yml | 0 .../multimodal_compact_bilinear_pooling.yml | 42 +++++ ...c2_classification_all_rnn_vgg16_concat.yml | 8 - .../c2_classification_all_rnn_vgg16_ewm.yml | 8 - .../c2_classification_all_rnn_vgg16_mcb.yml | 96 ++++++++++ ptp/components/models/__init__.py | 7 +- .../{ => vqa}/element_wise_multiplication.py | 0 .../multimodal_compact_bilinear_pooling.py | 169 ++++++++++++++++++ 8 files changed, 312 insertions(+), 18 deletions(-) rename configs/default/components/models/{ => vqa}/element_wise_multiplication.yml (100%) create mode 100644 configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml create mode 100644 configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml rename ptp/components/models/{ => vqa}/element_wise_multiplication.py (100%) create mode 100644 ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py diff --git a/configs/default/components/models/element_wise_multiplication.yml b/configs/default/components/models/vqa/element_wise_multiplication.yml similarity index 100% rename from configs/default/components/models/element_wise_multiplication.yml rename to 
configs/default/components/models/vqa/element_wise_multiplication.yml diff --git a/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml b/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml new file mode 100644 index 0000000..3d8a98a --- /dev/null +++ b/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml @@ -0,0 +1,42 @@ +# This file defines the default values for the Multimodal Compact Bilinear Pooling model. + +#################################################################### +# 1. CONFIGURATION PARAMETERS that will be LOADED by the component. +#################################################################### + +streams: + #################################################################### + # 2. Keymappings associated with INPUT and OUTPUT streams. + #################################################################### + + # Stream containing batch of encoded images (INPUT) + image_encodings: image_encodings + + # Stream containing batch of encoded questions (INPUT) + question_encodings: question_encodings + + # Stream containing outputs (OUTPUT) + outputs: outputs + +globals: + #################################################################### + # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. + #################################################################### + + # Size of the image encodings input (RETRIEVED) + image_encoding_size: image_encoding_size + + # Size of the question encodings input (RETRIEVED) + question_encoding_size: question_encoding_size + + # Size of the output (RETRIEVED) + output_size: output_size + + #################################################################### + # 4. Keymappings associated with GLOBAL variables that will be SET. + #################################################################### + + #################################################################### + # 5. Keymappings associated with statistics that will be ADDED. + #################################################################### + diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index 672a638..a678da9 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -1,14 +1,6 @@ # Load config defining problems for training, validation and testing. default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml -training: - # settings parameters - terminal_conditions: - loss_stop: 1.0e-2 - episode_limit: 100000 - epoch_limit: -1 - - pipeline: name: vqa_med_c2_classification_all_rnn_vgg_concat diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index 4208a7d..18cf84f 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -1,14 +1,6 @@ # Load config defining problems for training, validation and testing. 
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml -training: - # settings parameters - terminal_conditions: - loss_stop: 1.0e-2 - episode_limit: 10000 - epoch_limit: -1 - - pipeline: name: c2_classification_all_rnn_vgg16_ewm diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml new file mode 100644 index 0000000..258b9ee --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml @@ -0,0 +1,96 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +# Training parameters: +training: + problem: + batch_size: 5 +validation: + problem: + batch_size: 5 + +pipeline: + name: c2_classification_all_rnn_vgg16_mcb + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_encoder_output_size, fused_image_question_activation_size] + values: [200, 1000, 100] + + ################# PIPE 0: question ################# + # Questions encoding. + question_tokenizer: + priority: 1.1 + type: SentenceTokenizer + streams: + inputs: questions + outputs: tokenized_questions + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state_trainable: False + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: fusion + classification ################# + # Element wise multiplication + FF. 
+ question_image_fusion: + priority: 4.1 + type: MultimodalCompactBilinearPooling + dropout_rate: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: fused_image_question_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: fused_image_question_activation_size + + classifier: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + streams: + inputs: fused_image_question_activations + globals: + input_size: fused_image_question_activation_size + prediction_size: vocabulary_size_c2 + + #: pipeline diff --git a/ptp/components/models/__init__.py b/ptp/components/models/__init__.py index 97ab98b..b8b6093 100644 --- a/ptp/components/models/__init__.py +++ b/ptp/components/models/__init__.py @@ -1,5 +1,4 @@ from .convnet_encoder import ConvNetEncoder -from .element_wise_multiplication import ElementWiseMultiplication from .feed_forward_network import FeedForwardNetwork from .index_embeddings import IndexEmbeddings from .torch_vision_wrapper import TorchVisionWrapper @@ -8,9 +7,11 @@ from .recurrent_neural_network import RecurrentNeuralNetwork from .sentence_embeddings import SentenceEmbeddings +from .vqa.element_wise_multiplication import ElementWiseMultiplication +from .vqa.multimodal_compact_bilinear_pooling import MultimodalCompactBilinearPooling + __all__ = [ 'ConvNetEncoder', - 'ElementWiseMultiplication', 'FeedForwardNetwork', 'IndexEmbeddings', 'TorchVisionWrapper', @@ -18,4 +19,6 @@ 'Model', 'RecurrentNeuralNetwork', 'SentenceEmbeddings', + 'ElementWiseMultiplication', + 'MultimodalCompactBilinearPooling', ] diff --git a/ptp/components/models/element_wise_multiplication.py b/ptp/components/models/vqa/element_wise_multiplication.py similarity index 100% rename from ptp/components/models/element_wise_multiplication.py rename to ptp/components/models/vqa/element_wise_multiplication.py diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py new file mode 100644 index 0000000..4b6c7ce --- /dev/null +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (C) IBM Corporation 2018 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Tomasz Kornuta" + + +import torch +import numpy as np + +from ptp.components.models.model import Model +from ptp.data_types.data_definition import DataDefinition + + +class MultimodalCompactBilinearPooling(Model): + """ + Element of one of classical baselines for Visual Question Answering. + + The model inputs (question and image encodings) are combined with Compact Bilinear Pooling mechanism. + + Fukui, A., Park, D. H., Yang, D., Rohrbach, A., Darrell, T., & Rohrbach, M. (2016). Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847. 
+ + Gao, Y., Beijbom, O., Zhang, N., & Darrell, T. (2016). Compact bilinear pooling. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 317-326). + """ + def __init__(self, name, config): + """ + Initializes the model, creates the required layers. + + :param name: Name of the model (taken from the configuration file). + + :param config: Parameters read from configuration file. + :type config: ``ptp.configuration.ConfigInterface`` + + """ + super(MultimodalCompactBilinearPooling, self).__init__(name, MultimodalCompactBilinearPooling, config) + + # Get key mappings. + self.key_image_encodings = self.stream_keys["image_encodings"] + self.key_question_encodings = self.stream_keys["question_encodings"] + self.key_outputs = self.stream_keys["outputs"] + + # Retrieve input/output sizes from globals. + self.image_encoding_size = self.globals["image_encoding_size"] + self.question_encoding_size = self.globals["question_encoding_size"] + self.output_size = self.globals["output_size"] + + # Create the model. + #self.image_encodings_ff = torch.nn.Linear(self.image_encoding_size, self.output_size) + #self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.output_size) + + # Initialize sketch projection matrices. + self.image_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.image_encoding_size, self.output_size) + self.question_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.question_encoding_size, self.output_size) + + + + + def generate_count_sketch_projection_matrix(self, input_size, output_size): + """ + Initializes Count Sketch projection matrix for given input (size). + Its role will be to project vector v∈Rn to y∈Rd. + We initialize two vectors s∈{−1,1}n and h∈{1,...,d}n: + * s contains either 1 or −1 for each index + * h maps each index i in the input v to an index j in the output y. + Both s and h are initialized randomly from a uniform distribution and remain constant. + """ + # Generate s: 1 or -1 + s = 2 * np.random.randint(2, size=input_size) - 1 + s = torch.from_numpy(s) + #print("s=",s) + + # Generate h (indices) + h = np.random.randint(output_size, size=input_size) + #print("h=",h) + indices = np.concatenate((np.arange(input_size)[..., np.newaxis],h[..., np.newaxis]), axis=1) + indices = torch.from_numpy(indices) + #print("indices=",indices) + + # Generate sparse matrix. + sparse_sketch_matrix = torch.sparse.FloatTensor(indices.t(), s, torch.Size([input_size, output_size])) + #print("\n sparse_sketch_matrix=",sparse_sketch_matrix) + # Return dense matrix. + dense_ssm = sparse_sketch_matrix.to_dense().type(torch.FloatTensor) + #print("\n dense_ssm=",dense_ssm) + return dense_ssm + + + + def input_data_definitions(self): + """ + Function returns a dictionary with definitions of input data that are required by the component. + + :return: dictionary containing input data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_image_encodings: DataDefinition([-1, self.image_encoding_size], [torch.Tensor], "Batch of encoded images [BATCH_SIZE x IMAGE_ENCODING_SIZE]"), + self.key_question_encodings: DataDefinition([-1, self.question_encoding_size], [torch.Tensor], "Batch of encoded questions [BATCH_SIZE x QUESTION_ENCODING_SIZE]"), + } + + + def output_data_definitions(self): + """ + Function returns a dictionary with definitions of output data produced the component. 
+ + :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_outputs: DataDefinition([-1, self.output_size], [torch.Tensor], "Batch of outputs [BATCH_SIZE x OUTPUT_SIZE]") + } + + def forward(self, data_dict): + """ + Main forward pass of the model. + + :param data_dict: DataDict({'images',**}) + :type data_dict: ``ptp.dadatypes.DataDict`` + """ + + # Unpack DataDict. + enc_img = data_dict[self.key_image_encodings] + enc_q = data_dict[self.key_question_encodings] + #print("\n enc_img=",enc_img) + #print("\n image_sketch_projection_matrix=",self.image_sketch_projection_matrix) + + # Project both batches. + sketch_img = enc_img.mm(self.image_sketch_projection_matrix) + sketch_q = enc_q.mm(self.question_sketch_projection_matrix) + + # Add imaginary parts (with zeros). + sketch_img_reim = torch.stack([sketch_img, torch.zeros(sketch_img.shape)], dim=2) + sketch_q_reim = torch.stack([sketch_q, torch.zeros(sketch_q.shape)], dim=2) + #print("\n sketch_img_reim=",sketch_img_reim) + #print("\n sketch_img_reim.shape=",sketch_img_reim.shape) + + # Perform FFT. + # Returns the real and the imaginary parts together as one tensor of the same shape of input. + fft_img = torch.fft(sketch_img_reim, signal_ndim=1) + fft_q = torch.fft(sketch_q_reim, signal_ndim=1) + #print(fft_img) + + # Get real and imaginary parts. + real1 = fft_img[:,:,0] + imag1 = fft_img[:,:,1] + real2 = fft_q[:,:,0] + imag2 = fft_q[:,:,1] + + # Calculate product. + fft_product = torch.stack([real1 * real2 - imag1 * imag2, real1 * imag2 + imag1 * real2], dim = -1) + #print("fft_product=",fft_product) + + # Inverse FFT. + cbp = torch.ifft(fft_product, signal_ndim=1)[:,:,0] + #print("cbp=",cbp) + + # Add predictions to datadict. + data_dict.extend({self.key_outputs: cbp}) From 1e4b1231720af9ca10f968df5561aa4b21595059 Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:44:01 -0700 Subject: [PATCH 08/11] MCB gpu fix --- .../models/vqa/multimodal_compact_bilinear_pooling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py index 4b6c7ce..ed12ec3 100644 --- a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -93,7 +93,7 @@ def generate_count_sketch_projection_matrix(self, input_size, output_size): sparse_sketch_matrix = torch.sparse.FloatTensor(indices.t(), s, torch.Size([input_size, output_size])) #print("\n sparse_sketch_matrix=",sparse_sketch_matrix) # Return dense matrix. 
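The two building blocks above -- the fixed count-sketch projections and the FFT-based circular convolution -- can be condensed into a short sketch (illustrative batch size; the 1000/200/100 sizes come from the c2 MCB config; written against the newer torch.fft module rather than the torch.fft()/torch.ifft() calls used in the component):

    import numpy as np
    import torch

    def count_sketch_matrix(n, d):
        # s in {-1,+1}^n flips signs, h scatters each input index into one of d output buckets.
        s = 2 * np.random.randint(2, size=n) - 1
        h = np.random.randint(d, size=n)
        m = torch.zeros(n, d)
        m[torch.arange(n), torch.from_numpy(h)] = torch.from_numpy(s).float()
        return m

    n_img, n_q, d = 1000, 200, 100
    P_img, P_q = count_sketch_matrix(n_img, d), count_sketch_matrix(n_q, d)

    enc_img, enc_q = torch.randn(8, n_img), torch.randn(8, n_q)
    sketch_img, sketch_q = enc_img @ P_img, enc_q @ P_q      # project both modalities into R^d
    # Element-wise product in the frequency domain == circular convolution of the sketches.
    cbp = torch.fft.irfft(torch.fft.rfft(sketch_img) * torch.fft.rfft(sketch_q), n=d)   # [8 x 100]
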
- dense_ssm = sparse_sketch_matrix.to_dense().type(torch.FloatTensor) + dense_ssm = sparse_sketch_matrix.to_dense().type(self.app_state.FloatTensor) #print("\n dense_ssm=",dense_ssm) return dense_ssm From a2055b6ce51962ca551f4ea0f3f55d096dd665ec Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:45:20 -0700 Subject: [PATCH 09/11] MCB gpu fix 2 --- .../models/vqa/multimodal_compact_bilinear_pooling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py index ed12ec3..5c40347 100644 --- a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -140,8 +140,8 @@ def forward(self, data_dict): sketch_q = enc_q.mm(self.question_sketch_projection_matrix) # Add imaginary parts (with zeros). - sketch_img_reim = torch.stack([sketch_img, torch.zeros(sketch_img.shape)], dim=2) - sketch_q_reim = torch.stack([sketch_q, torch.zeros(sketch_q.shape)], dim=2) + sketch_img_reim = torch.stack([sketch_img, torch.zeros(sketch_img.shape).type(self.app_state.FloatTensor)], dim=2) + sketch_q_reim = torch.stack([sketch_q, torch.zeros(sketch_q.shape).type(self.app_state.FloatTensor)], dim=2) #print("\n sketch_img_reim=",sketch_img_reim) #print("\n sketch_img_reim.shape=",sketch_img_reim.shape) From e5189ad01264b580ee4ffac6bc9d3426555fe2f0 Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:48:48 -0700 Subject: [PATCH 10/11] MCB - batch size 128 --- .../c2_classification_all_rnn_vgg16_mcb.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml index 258b9ee..0ea068e 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml @@ -2,12 +2,12 @@ default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml # Training parameters: -training: - problem: - batch_size: 5 -validation: - problem: - batch_size: 5 +#training: +# problem: +# batch_size: 5 +#validation: +# problem: +# batch_size: 5 pipeline: name: c2_classification_all_rnn_vgg16_mcb From d35c9f7e2514d695158f5db00ffb3b54ac8d6e6f Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:59:42 -0700 Subject: [PATCH 11/11] cleanups of MCB --- .../models/vqa/multimodal_compact_bilinear_pooling.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py index 5c40347..4e2c6be 100644 --- a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -34,6 +34,9 @@ class MultimodalCompactBilinearPooling(Model): Fukui, A., Park, D. H., Yang, D., Rohrbach, A., Darrell, T., & Rohrbach, M. (2016). Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847. Gao, Y., Beijbom, O., Zhang, N., & Darrell, T. (2016). Compact bilinear pooling. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 317-326). 
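The two GPU fixes above replace the hard-coded torch.FloatTensor casts and plain torch.zeros(...) calls with the application-wide tensor type, so the projection matrices and the zero imaginary parts end up on the same device as the incoming encodings. Roughly (assuming app_state.FloatTensor resolves to torch.cuda.FloatTensor when running on a GPU):

    import torch

    proj = torch.randn(1000, 100)                  # built on the CPU at construction time
    enc = torch.randn(8, 1000, device="cuda")      # activations arrive on the GPU
    # enc.mm(proj) would fail with a device-mismatch error;
    # casting to the app-wide tensor type moves the matrix to the GPU first:
    proj = proj.type(torch.cuda.FloatTensor)       # stand-in for .type(self.app_state.FloatTensor)
    fused = enc.mm(proj)                           # [8 x 100]
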
+ + Inspired by implementation from: + https://github.com/DeepInsight-PCALab/CompactBilinearPooling-Pytorch/blob/master/CompactBilinearPooling.py """ def __init__(self, name, config): """ @@ -57,17 +60,11 @@ def __init__(self, name, config): self.question_encoding_size = self.globals["question_encoding_size"] self.output_size = self.globals["output_size"] - # Create the model. - #self.image_encodings_ff = torch.nn.Linear(self.image_encoding_size, self.output_size) - #self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.output_size) - # Initialize sketch projection matrices. self.image_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.image_encoding_size, self.output_size) self.question_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.question_encoding_size, self.output_size) - - def generate_count_sketch_projection_matrix(self, input_size, output_size): """ Initializes Count Sketch projection matrix for given input (size).