From 078d5c0b054ab46fe6706ccf3e9614de531fa6db Mon Sep 17 00:00:00 2001 From: tkornut Date: Fri, 19 Apr 2019 16:30:28 -0700 Subject: [PATCH 01/11] c2: 100000 --- .../c2_classification_all_rnn_vgg16_concat.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index d4745b6..0999e73 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -1,6 +1,14 @@ # Load config defining problems for training, validation and testing. default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml +training: + # settings parameters + terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 100000 + epoch_limit: -1 + + pipeline: name: vqa_med_c2_classification_all_rnn_vgg_concat From 71b7ff972fc162a39448877af314bc526e81f0dd Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 16:59:09 -0700 Subject: [PATCH 02/11] element wise multiplication component + pipeline for c2 --- .../models/element_wise_multiplication.yml | 46 +++++++ .../c2_classification_all_rnn_vgg16_ewm.yml | 98 ++++++++++++++ ptp/components/models/__init__.py | 2 + .../models/element_wise_multiplication.py | 122 ++++++++++++++++++ 4 files changed, 268 insertions(+) create mode 100644 configs/default/components/models/element_wise_multiplication.yml create mode 100644 configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml create mode 100644 ptp/components/models/element_wise_multiplication.py diff --git a/configs/default/components/models/element_wise_multiplication.yml b/configs/default/components/models/element_wise_multiplication.yml new file mode 100644 index 0000000..f0f02d8 --- /dev/null +++ b/configs/default/components/models/element_wise_multiplication.yml @@ -0,0 +1,46 @@ +# This file defines the default values for the ElementWiseMultiplication model. + +#################################################################### +# 1. CONFIGURATION PARAMETERS that will be LOADED by the component. +#################################################################### + +# Dropout rate (LOADED) +# Default: 0 (means that it is turned off) +dropout_rate: 0 + +streams: + #################################################################### + # 2. Keymappings associated with INPUT and OUTPUT streams. + #################################################################### + + # Stream containing batch of encoded images (INPUT) + image_encodings: image_encodings + + # Stream containing batch of encoded questions (INPUT) + question_encodings: question_encodings + + # Stream containing outputs (OUTPUT) + outputs: outputs + +globals: + #################################################################### + # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. + #################################################################### + + # Size of the image encodings input (RETRIEVED) + image_encoding_size: image_encoding_size + + # Size of the question encodings input (RETRIEVED) + question_encoding_size: question_encoding_size + + # Size of the output (RETRIEVED) + output_size: output_size + + #################################################################### + # 4. Keymappings associated with GLOBAL variables that will be SET. 
+ #################################################################### + + #################################################################### + # 5. Keymappings associated with statistics that will be ADDED. + #################################################################### + diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml new file mode 100644 index 0000000..8614f78 --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -0,0 +1,98 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + # settings parameters + terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 10000 + epoch_limit: -1 + + +pipeline: + name: c2_classification_all_rnn_vgg16_ewm + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size] + values: [100, 100, 100] + + # First subpipeline: question. + # Questions encoding. + question_tokenizer: + priority: 1.1 + type: SentenceTokenizer + streams: + inputs: questions + outputs: tokenized_questions + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state_trainable: False + #num_layers: 5 + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + # 3rd subpipeline: image. + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + streams: + inputs: images + predictions: image_activations + globals: + prediction_size: image_encoder_output_size + + # 4th subpipeline: element wise multiplication + FF. 
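For orientation, the question branch configured above reduces to the following shape flow (a rough sketch with stand-in torch modules and a hypothetical vocabulary size; the real SentenceTokenizer / SentenceEmbeddings / RecurrentNeuralNetwork components handle word mappings, padding and log-softmax themselves):

    import torch

    vocab_size, batch, seq_len = 10000, 64, 16              # hypothetical values
    embed = torch.nn.Embedding(vocab_size, 50)               # embeddings_size: 50 (glove.6B.50d)
    lstm = torch.nn.LSTM(input_size=50, hidden_size=50, batch_first=True)
    head = torch.nn.Linear(50, 100)                          # prediction_size -> question_encoder_output_size

    tokens = torch.randint(0, vocab_size, (batch, seq_len))  # tokenized, index-mapped questions
    outputs, _ = lstm(embed(tokens))                         # [64 x 16 x 50]
    question_activations = head(outputs[:, -1])              # prediction_mode: Last -> [64 x 100]

The image branch (TorchVisionWrapper) produces image_activations of the matching size (here 100), so the two streams can be fused element-wise in the step configured next.
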
+ question_image_fusion: + priority: 4.1 + type: ElementWiseMultiplication + dropout: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: element_wise_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: element_wise_activation_size + + + classifier: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout: 0.5 + streams: + inputs: element_wise_activations + globals: + input_size: element_wise_activation_size + prediction_size: vocabulary_size_c2 + + + #: pipeline diff --git a/ptp/components/models/__init__.py b/ptp/components/models/__init__.py index 32e95b0..97ab98b 100644 --- a/ptp/components/models/__init__.py +++ b/ptp/components/models/__init__.py @@ -1,4 +1,5 @@ from .convnet_encoder import ConvNetEncoder +from .element_wise_multiplication import ElementWiseMultiplication from .feed_forward_network import FeedForwardNetwork from .index_embeddings import IndexEmbeddings from .torch_vision_wrapper import TorchVisionWrapper @@ -9,6 +10,7 @@ __all__ = [ 'ConvNetEncoder', + 'ElementWiseMultiplication', 'FeedForwardNetwork', 'IndexEmbeddings', 'TorchVisionWrapper', diff --git a/ptp/components/models/element_wise_multiplication.py b/ptp/components/models/element_wise_multiplication.py new file mode 100644 index 0000000..a5c88c8 --- /dev/null +++ b/ptp/components/models/element_wise_multiplication.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (C) IBM Corporation 2018 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Tomasz Kornuta" + + +import torch + +from ptp.components.models.model import Model +from ptp.data_types.data_definition import DataDefinition + + +class ElementWiseMultiplication(Model): + """ + Element of one of classical baselines for Visual Question Answering. + The model inputs (question and image encodings) are fused via element-wise multiplication and returned (for subsequent classification, done in a separate component e.g. ffn). + + On the basis of: Jiasen Lu and Xiao Lin and Dhruv Batra and Devi Parikh. "Deeper LSTM and normalized CNN visual question answering model" (2015). + """ + def __init__(self, name, config): + """ + Initializes the model, creates the required layers. + + :param name: Name of the model (taken from the configuration file). + + :param config: Parameters read from configuration file. + :type config: ``ptp.configuration.ConfigInterface`` + + """ + super(ElementWiseMultiplication, self).__init__(name, ElementWiseMultiplication, config) + + # Get key mappings. + self.key_image_encodings = self.stream_keys["image_encodings"] + self.key_question_encodings = self.stream_keys["question_encodings"] + self.key_outputs = self.stream_keys["outputs"] + + # Retrieve input/output sizes from globals. 
+ self.image_encoding_size = self.globals["image_encoding_size"] + self.question_encoding_size = self.globals["question_encoding_size"] + self.output_size = self.globals["output_size"] + + # Create the model. + self.image_encodings_ff = torch.nn.Linear(self.image_encoding_size, self.output_size) + self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.output_size) + + # Create activation layer. + self.activation = torch.nn.ReLU() + + # Retrieve dropout rate value - if set, will put dropout between every layer. + dropout_rate = self.config["dropout_rate"] + + # Create dropout layer. + self.dropout = torch.nn.Dropout(dropout_rate) + + + + + def input_data_definitions(self): + """ + Function returns a dictionary with definitions of input data that are required by the component. + + :return: dictionary containing input data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_image_encodings: DataDefinition([-1, self.image_encoding_size], [torch.Tensor], "Batch of encoded images [BATCH_SIZE x IMAGE_ENCODING_SIZE]"), + self.key_question_encodings: DataDefinition([-1, self.question_encoding_size], [torch.Tensor], "Batch of encoded questions [BATCH_SIZE x QUESTION_ENCODING_SIZE]"), + } + + + def output_data_definitions(self): + """ + Function returns a dictionary with definitions of output data produced the component. + + :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_outputs: DataDefinition([-1, self.output_size], [torch.Tensor], "Batch of outputs [BATCH_SIZE x OUTPUT_SIZE]") + } + + def forward(self, data_dict): + """ + Main forward pass of the model. + + :param data_dict: DataDict({'images',**}) + :type data_dict: ``ptp.dadatypes.DataDict`` + """ + + # Unpack DataDict. + enc_img = data_dict[self.key_image_encodings] + enc_q = data_dict[self.key_question_encodings] + + # Apply nonlinearities and dropout on images. + enc_img = self.activation(enc_img) + enc_img = self.dropout(enc_img) + + # Apply nonlinearities and dropout on questions. + enc_q = self.activation(enc_q) + enc_q = self.dropout(enc_q) + + # Pass inputs layers mapping them to the same "latent space". + latent_img = self.image_encodings_ff(enc_img) + latent_q = self.question_encodings_ff(enc_q) + + # Element wise multiplication. + outputs = latent_img * latent_q + + # Add predictions to datadict. 
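Putting this forward pass together, the fusion amounts to the following minimal sketch (illustrative batch size and the 100-unit sizes used by the c2 configs; the component itself reads all sizes from globals):

    import torch

    B, D = 64, 100                                    # batch size, shared latent size (illustrative)
    image_ff = torch.nn.Linear(D, D)                  # image_encodings_ff
    question_ff = torch.nn.Linear(D, D)               # question_encodings_ff
    act, drop = torch.nn.ReLU(), torch.nn.Dropout(0.5)

    enc_img, enc_q = torch.randn(B, D), torch.randn(B, D)
    latent_img = image_ff(drop(act(enc_img)))         # project image encodings into the latent space
    latent_q = question_ff(drop(act(enc_q)))          # project question encodings into the same space
    outputs = latent_img * latent_q                   # element-wise (Hadamard) fusion -> [B x D]
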
+ data_dict.extend({self.key_outputs: outputs}) From 0680805f5691ee2a2fa831ded1137debabd1bb1c Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 17:14:26 -0700 Subject: [PATCH 03/11] c2: pipeline with question-image element-wise multiplication and late fusion concat with image-size --- .../c2_classification_all_rnn_vgg16_ewm.yml | 10 +- ..._classification_all_rnn_vgg16_ewm_size.yml | 137 ++++++++++++++++++ 2 files changed, 141 insertions(+), 6 deletions(-) create mode 100644 configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index 8614f78..1fa84d0 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -19,7 +19,7 @@ pipeline: keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size] values: [100, 100, 100] - # First subpipeline: question. + ################# PIPE 0: question ################# # Questions encoding. question_tokenizer: priority: 1.1 @@ -48,7 +48,6 @@ pipeline: prediction_mode: Last use_logsoftmax: False initial_state_trainable: False - #num_layers: 5 hidden_size: 50 streams: inputs: embedded_questions @@ -57,7 +56,7 @@ pipeline: input_size: embeddings_size prediction_size: question_encoder_output_size - # 3rd subpipeline: image. + ################# PIPE 2: image ################# # Image encoder. image_encoder: priority: 3.1 @@ -68,7 +67,8 @@ pipeline: globals: prediction_size: image_encoder_output_size - # 4th subpipeline: element wise multiplication + FF. + ################# PIPE 3: fusion + classification ################# + # Element wise multiplication + FF. question_image_fusion: priority: 4.1 type: ElementWiseMultiplication @@ -82,7 +82,6 @@ pipeline: question_encoding_size: question_encoder_output_size output_size: element_wise_activation_size - classifier: priority: 4.2 type: FeedForwardNetwork @@ -94,5 +93,4 @@ pipeline: input_size: element_wise_activation_size prediction_size: vocabulary_size_c2 - #: pipeline diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml new file mode 100644 index 0000000..ab908a5 --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -0,0 +1,137 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + # settings parameters + terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 10000 + epoch_limit: -1 + + +pipeline: + name: c2_classification_all_rnn_vgg16_ewm_size + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size,image_size_encoder_input_size, image_size_encoder_output_size] + values: [100, 100, 100, 2, 10] + + ################# PIPE 0: question ################# + # Questions encoding. 
+ question_tokenizer: + priority: 1.1 + type: SentenceTokenizer + streams: + inputs: questions + outputs: tokenized_questions + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state_trainable: False + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + streams: + inputs: images + predictions: image_activations + globals: + prediction_size: image_encoder_output_size + + ################# PIPE 3: image-question fusion ################# + # Element wise multiplication + FF. + question_image_fusion: + priority: 4.1 + type: ElementWiseMultiplication + dropout: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: element_wise_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: element_wise_activation_size + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout: 0.5 + streams: + inputs: element_wise_activations + predictions: question_image_activations + globals: + input_size: element_wise_activation_size + prediction_size: element_wise_activation_size + + ################# PIPE 4: image-question-image size fusion + classification ################# + # 2nd subpipeline: image size. + # Model - image size classifier. + image_size_encoder: + priority: 5.1 + type: FeedForwardNetwork + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + # 4th subpipeline: concatenation + FF. 
+ concat: + priority: 5.2 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,10]] + output_dims: [-1,110] + streams: + outputs: concatenated_activations + globals: + output_size: concatentated_activations_size + + + classifier: + priority: 5.3 + type: FeedForwardNetwork + hidden_sizes: [110] + dropout: 0.5 + streams: + inputs: concatenated_activations + globals: + input_size: concatentated_activations_size + prediction_size: vocabulary_size_c2 + + + #: pipeline From 9a4e958d02ca480a1820e3bf39f3597a01591eaa Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 17:30:32 -0700 Subject: [PATCH 04/11] c2: 128 batch size --- .../c2_classification_all_rnn_vgg16_ewm_size.yml | 8 -------- .../c2_classification/default_c2_classification.yml | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml index ab908a5..e867c17 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -1,14 +1,6 @@ # Load config defining problems for training, validation and testing. default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml -training: - # settings parameters - terminal_conditions: - loss_stop: 1.0e-2 - episode_limit: 10000 - epoch_limit: -1 - - pipeline: name: c2_classification_all_rnn_vgg16_ewm_size diff --git a/configs/vqa_med_2019/c2_classification/default_c2_classification.yml b/configs/vqa_med_2019/c2_classification/default_c2_classification.yml index 3df45b4..f88e328 100644 --- a/configs/vqa_med_2019/c2_classification/default_c2_classification.yml +++ b/configs/vqa_med_2019/c2_classification/default_c2_classification.yml @@ -4,16 +4,24 @@ default_configs: vqa_med_2019/default_vqa_med_2019.yml # Training parameters: training: problem: + batch_size: 128 categories: C2 sampler: name: WeightedRandomSampler weights: ~/data/vqa-med/answers.c2.weights.csv dataloader: num_workers: 4 + # Termination. 
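The late-fusion block configured above (concat + classifier) boils down to a concatenation along the feature dimension; in tensor terms (illustrative batch size, feature sizes taken from the config):

    import torch

    question_image_activations = torch.randn(64, 100)   # output of the EWM + FF stage
    image_size_activations = torch.randn(64, 10)         # output of the image-size encoder
    concatenated_activations = torch.cat(
        [question_image_activations, image_size_activations], dim=1)   # [64 x 110]
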
+ terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 10000 + epoch_limit: -1 + # Validation parameters: validation: problem: + batch_size: 128 categories: C2 dataloader: num_workers: 4 From 957ccf7ec7baf4d7b5ba061a14ae465d98ec9eb1 Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 18:55:38 -0700 Subject: [PATCH 05/11] Refactored Wrapper, added option to return feature maps (VGG16 only) --- .../models/torch_vision_wrapper.yml | 27 ++++++++-- ...c1_classification_all_bow_vgg16_concat.yml | 4 +- ...c1_classification_all_rnn_vgg16_concat.yml | 4 +- ...c2_classification_all_rnn_vgg16_concat.yml | 4 +- .../c2_classification_all_rnn_vgg16_ewm.yml | 4 +- ..._classification_all_rnn_vgg16_ewm_size.yml | 4 +- ...c3_classification_all_bow_vgg16_concat.yml | 4 +- ...c3_classification_all_rnn_vgg16_concat.yml | 4 +- .../c3_classification_image_vgg16_softmax.yml | 4 +- ...nn_shared_all_encoders_two_ffns_losses.yml | 4 +- ...t_rnn_shared_all_encoders_one_ffn_loss.yml | 4 +- ...n_shared_all_encoders_four_ffns_losses.yml | 4 +- ..._shared_all_encoders_three_ffns_losses.yml | 4 +- ptp/components/models/convnet_encoder.py | 2 +- ptp/components/models/torch_vision_wrapper.py | 50 ++++++++++++++----- 15 files changed, 84 insertions(+), 43 deletions(-) diff --git a/configs/default/components/models/torch_vision_wrapper.yml b/configs/default/components/models/torch_vision_wrapper.yml index 28db8d8..968eb9c 100644 --- a/configs/default/components/models/torch_vision_wrapper.yml +++ b/configs/default/components/models/torch_vision_wrapper.yml @@ -1,4 +1,4 @@ -# This file defines the default values for the LeNet5 model. +# This file defines the default values for the component wrapping (pretrained) Torch Vision models. #################################################################### # 1. CONFIGURATION PARAMETERS that will be LOADED by the component. @@ -8,6 +8,10 @@ #model_type: VGG16 # HARDCODED FOR NOW! +# Parameter denoting whether the component will return (flat) prediction +# or output of last feature layer (LOADED) +return_feature_maps: False + streams: #################################################################### # 2. Keymappings associated with INPUT and OUTPUT streams. @@ -16,21 +20,34 @@ streams: # Stream containing batch of images (INPUT) inputs: inputs - # Stream containing predictions (OUTPUT) - predictions: predictions + # Stream containing outputs (features or "predictions") (OUTPUT) + outputs: outputs globals: #################################################################### # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. #################################################################### - # Size of the prediction (RETRIEVED) - prediction_size: prediction_size + # Size of the output (RETRIEVED) + # Used when return_features = False. + output_size: output_size #################################################################### # 4. Keymappings associated with GLOBAL variables that will be SET. #################################################################### + # Height of the returned features tensor (SET) + # Used when return_features = True. + feature_maps_height: feature_maps_height + + # Width of the returned features tensor (SET) + # Used when return_features = True. + feature_maps_width: feature_maps_width + + # Depth of the returned features tensor (SET) + # Used when return_features = True. + feature_maps_depth: feature_maps_depth + #################################################################### # 5. 
Keymappings associated with statistics that will be ADDED. #################################################################### diff --git a/configs/vqa_med_2019/c1_classification/c1_classification_all_bow_vgg16_concat.yml b/configs/vqa_med_2019/c1_classification/c1_classification_all_bow_vgg16_concat.yml index b2e6ce1..a38067e 100644 --- a/configs/vqa_med_2019/c1_classification/c1_classification_all_bow_vgg16_concat.yml +++ b/configs/vqa_med_2019/c1_classification/c1_classification_all_bow_vgg16_concat.yml @@ -61,9 +61,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c1_classification/c1_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c1_classification/c1_classification_all_rnn_vgg16_concat.yml index 62b4389..f8942cc 100644 --- a/configs/vqa_med_2019/c1_classification/c1_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c1_classification/c1_classification_all_rnn_vgg16_concat.yml @@ -68,9 +68,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index 0999e73..fa83133 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -76,9 +76,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index 1fa84d0..af37b20 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -63,9 +63,9 @@ pipeline: type: TorchVisionWrapper streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: fusion + classification ################# # Element wise multiplication + FF. diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml index e867c17..47a780c 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -55,9 +55,9 @@ pipeline: type: TorchVisionWrapper streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: image-question fusion ################# # Element wise multiplication + FF. 
diff --git a/configs/vqa_med_2019/c3_classification/c3_classification_all_bow_vgg16_concat.yml b/configs/vqa_med_2019/c3_classification/c3_classification_all_bow_vgg16_concat.yml index ed3ed6a..0a02d41 100644 --- a/configs/vqa_med_2019/c3_classification/c3_classification_all_bow_vgg16_concat.yml +++ b/configs/vqa_med_2019/c3_classification/c3_classification_all_bow_vgg16_concat.yml @@ -61,9 +61,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c3_classification/c3_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c3_classification/c3_classification_all_rnn_vgg16_concat.yml index 51b30c6..3a634fb 100644 --- a/configs/vqa_med_2019/c3_classification/c3_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c3_classification/c3_classification_all_rnn_vgg16_concat.yml @@ -68,9 +68,9 @@ pipeline: priority: 3.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size # 4th subpipeline: concatenation + FF. concat: diff --git a/configs/vqa_med_2019/c3_classification/c3_classification_image_vgg16_softmax.yml b/configs/vqa_med_2019/c3_classification/c3_classification_image_vgg16_softmax.yml index 9bc9412..1882aca 100644 --- a/configs/vqa_med_2019/c3_classification/c3_classification_image_vgg16_softmax.yml +++ b/configs/vqa_med_2019/c3_classification/c3_classification_image_vgg16_softmax.yml @@ -10,9 +10,9 @@ pipeline: priority: 1.1 streams: inputs: images - predictions: vgg_images + outputs: vgg_images globals: - prediction_size: vocabulary_size_c3 + output_size: vocabulary_size_c3 # Model - softmax classifier. 
classifier: diff --git a/configs/vqa_med_2019/vf/c1_binary_vf_cat_rnn_shared_all_encoders_two_ffns_losses.yml b/configs/vqa_med_2019/vf/c1_binary_vf_cat_rnn_shared_all_encoders_two_ffns_losses.yml index 6996f91..1d31f30 100644 --- a/configs/vqa_med_2019/vf/c1_binary_vf_cat_rnn_shared_all_encoders_two_ffns_losses.yml +++ b/configs/vqa_med_2019/vf/c1_binary_vf_cat_rnn_shared_all_encoders_two_ffns_losses.yml @@ -182,9 +182,9 @@ pipeline: priority: 2.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# diff --git a/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml b/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml index ef8f535..4a5a179 100644 --- a/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml +++ b/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml @@ -182,9 +182,9 @@ pipeline: priority: 2.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# diff --git a/configs/vqa_med_2019/vf/c1_c2_c3_binary_vf_cat_rnn_shared_all_encoders_four_ffns_losses.yml b/configs/vqa_med_2019/vf/c1_c2_c3_binary_vf_cat_rnn_shared_all_encoders_four_ffns_losses.yml index 56ab04b..8b7b50a 100644 --- a/configs/vqa_med_2019/vf/c1_c2_c3_binary_vf_cat_rnn_shared_all_encoders_four_ffns_losses.yml +++ b/configs/vqa_med_2019/vf/c1_c2_c3_binary_vf_cat_rnn_shared_all_encoders_four_ffns_losses.yml @@ -182,9 +182,9 @@ pipeline: priority: 2.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# diff --git a/configs/vqa_med_2019/vf/c1_c3_binary_vf_cat_rnn_shared_all_encoders_three_ffns_losses.yml b/configs/vqa_med_2019/vf/c1_c3_binary_vf_cat_rnn_shared_all_encoders_three_ffns_losses.yml index 3b1d952..105f2e3 100644 --- a/configs/vqa_med_2019/vf/c1_c3_binary_vf_cat_rnn_shared_all_encoders_three_ffns_losses.yml +++ b/configs/vqa_med_2019/vf/c1_c3_binary_vf_cat_rnn_shared_all_encoders_three_ffns_losses.yml @@ -182,9 +182,9 @@ pipeline: priority: 2.1 streams: inputs: images - predictions: image_activations + outputs: image_activations globals: - prediction_size: image_encoder_output_size + output_size: image_encoder_output_size ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# diff --git a/ptp/components/models/convnet_encoder.py b/ptp/components/models/convnet_encoder.py index 288f0cd..788ef5f 100644 --- a/ptp/components/models/convnet_encoder.py +++ b/ptp/components/models/convnet_encoder.py @@ -224,7 +224,7 @@ def output_data_definitions(self): :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). 
""" return { - self.key_feature_maps: DataDefinition([-1, self.out_channels_conv3, self.height_features_maxpool3, self.width_features_maxpool3], [torch.Tensor], "Batch of filter maps [BATCH_SIZE x DEPTH x HEIGHT x WIDTH]") + self.key_feature_maps: DataDefinition([-1, self.out_channels_conv3, self.height_features_maxpool3, self.width_features_maxpool3], [torch.Tensor], "Batch of filter maps [BATCH_SIZE x FEAT_DEPTH x FEAT_HEIGHT x FEAT_WIDTH]") } def forward(self, data_dict): diff --git a/ptp/components/models/torch_vision_wrapper.py b/ptp/components/models/torch_vision_wrapper.py index 13692f1..4c5027c 100644 --- a/ptp/components/models/torch_vision_wrapper.py +++ b/ptp/components/models/torch_vision_wrapper.py @@ -43,15 +43,34 @@ def __init__(self, name, config): # Get key mappings. self.key_inputs = self.stream_keys["inputs"] - self.key_predictions = self.stream_keys["predictions"] - - # Retrieve prediction size from globals. - self.prediction_size = self.globals["prediction_size"] + self.key_outputs = self.stream_keys["outputs"] # Get VGG16 self.model = models.vgg16(pretrained=True) - # "Replace" last layer. - self.model.classifier._modules['6'] = torch.nn.Linear(4096, self.prediction_size) + + # Check operation mode. + self.return_feature_maps = self.config["return_feature_maps"] + + if self.return_feature_maps: + # Use only the "feature encoder". + self.model = self.model.features + + # Height of the returned features tensor (SET) + self.feature_maps_height = 7 + self.globals["feature_maps_height"] = self.feature_maps_height + # Width of the returned features tensor (SET) + self.feature_maps_width = 7 + self.globals["feature_maps_width"] = self.feature_maps_width + # Depth of the returned features tensor (SET) + self.feature_maps_depth = 512 + self.globals["feature_maps_depth"] = self.feature_maps_depth + + else: + # Use the whole model, but cut/reshape only the last layer. + # Retrieve prediction size from globals. + self.output_size = self.globals["output_size"] + # "Replace" last layer. + self.model.classifier._modules['6'] = torch.nn.Linear(4096, self.output_size) def input_data_definitions(self): @@ -71,9 +90,14 @@ def output_data_definitions(self): :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). """ - return { - self.key_predictions: DataDefinition([-1, self.prediction_size], [torch.Tensor], "Batch of predictions, each represented as probability distribution over classes [BATCH_SIZE x PREDICTION_SIZE]") - } + if self.return_feature_maps: + return { + self.key_outputs: DataDefinition([-1, self.feature_maps_depth, self.feature_maps_height, self.feature_maps_width], [torch.Tensor], "Batch of feature maps [BATCH_SIZE x FEAT_DEPTH x FEAT_HEIGHT x FEAT_WIDTH]") + } + else: + return { + self.key_outputs: DataDefinition([-1, self.output_size], [torch.Tensor], "Batch of outputs, each represented as probability distribution over classes [BATCH_SIZE x PREDICTION_SIZE]") + } def forward(self, data_dict): """ @@ -82,7 +106,7 @@ def forward(self, data_dict): :param data_dict: DataDict({'inputs', ....}), where: - inputs: expected stream containing images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE WIDTH] - - outpus: added stream containing predictions [BATCH_SIZE x PREDICTION_SIZE] + - outpus: added stream containing outputs [BATCH_SIZE x PREDICTION_SIZE] :type data_dict: ``ptp.data_types.DataDict`` @@ -91,7 +115,7 @@ def forward(self, data_dict): # Unpack DataDict. 
img = data_dict[self.key_inputs] - predictions = self.model(img) + outputs = self.model(img) - # Add predictions to datadict. - data_dict.extend({self.key_predictions: predictions}) + # Add outputs to datadict. + data_dict.extend({self.key_outputs: outputs}) From 3a8f95a34adc0b60b25260e4ec073e6e1d2844ee Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 19:22:26 -0700 Subject: [PATCH 06/11] c2: dropout_rate: 0.5 --- .../c2_classification_all_rnn_vgg16_concat.yml | 13 +++++++------ .../c2_classification_all_rnn_vgg16_ewm.yml | 4 ++-- .../c2_classification_all_rnn_vgg16_ewm_size.yml | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index fa83133..672a638 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -13,8 +13,8 @@ pipeline: name: vqa_med_c2_classification_all_rnn_vgg_concat global_publisher: - type: GlobalVariablePublisher priority: 0 + type: GlobalVariablePublisher # Add input_size to globals. keys: [question_embeddings_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size] values: [100, 2, 10, 100] @@ -30,8 +30,8 @@ pipeline: # Model 1: Embeddings question_embeddings: - type: SentenceEmbeddings priority: 1.2 + type: SentenceEmbeddings embeddings_size: 50 pretrained_embeddings_file: glove.6B.50d.txt data_folder: ~/data/vqa-med @@ -42,10 +42,10 @@ pipeline: # Model 2: RNN question_lstm: + priority: 1.3 type: RecurrentNeuralNetwork cell_type: LSTM prediction_mode: Last - priority: 1.3 use_logsoftmax: False initial_state_trainable: False #num_layers: 5 @@ -72,8 +72,8 @@ pipeline: # 3rd subpipeline: image. # Image encoder. image_encoder: - type: TorchVisionWrapper priority: 3.1 + type: TorchVisionWrapper streams: inputs: images outputs: image_activations @@ -82,8 +82,8 @@ pipeline: # 4th subpipeline: concatenation + FF. 
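For reference, the two operating modes of the refactored TorchVisionWrapper boil down to the following torchvision usage (a sketch; the 512 x 7 x 7 feature-map shape assumes the standard 224 x 224 input resolution):

    import torch
    from torchvision import models

    vgg = models.vgg16(pretrained=True)

    # return_feature_maps: True  -- keep only the convolutional trunk.
    feature_extractor = vgg.features                 # [B x 512 x 7 x 7] for 224x224 inputs

    # return_feature_maps: False -- keep the classifier, resize its last layer.
    vgg.classifier[6] = torch.nn.Linear(4096, 100)   # output_size retrieved from globals (e.g. 100)

    x = torch.randn(2, 3, 224, 224)
    print(feature_extractor(x).shape)                # torch.Size([2, 512, 7, 7])
    print(vgg(x).shape)                              # torch.Size([2, 100])
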
concat: - type: Concatenation priority: 4.1 + type: Concatenation input_streams: [question_activations,image_size_activations,image_activations] # Concatenation dim: 1 # default @@ -96,9 +96,10 @@ pipeline: classifier: + priority: 4.2 type: FeedForwardNetwork hidden_sizes: [100] - priority: 4.2 + dropout_rate: 0.5 streams: inputs: concatenated_activations globals: diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index af37b20..4208a7d 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -72,7 +72,7 @@ pipeline: question_image_fusion: priority: 4.1 type: ElementWiseMultiplication - dropout: 0.5 + dropout_rate: 0.5 streams: image_encodings: image_activations question_encodings: question_activations @@ -86,7 +86,7 @@ pipeline: priority: 4.2 type: FeedForwardNetwork hidden_sizes: [100] - dropout: 0.5 + dropout_rate: 0.5 streams: inputs: element_wise_activations globals: diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml index 47a780c..91ff5d1 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -64,7 +64,7 @@ pipeline: question_image_fusion: priority: 4.1 type: ElementWiseMultiplication - dropout: 0.5 + dropout_rate: 0.5 streams: image_encodings: image_activations question_encodings: question_activations @@ -78,7 +78,7 @@ pipeline: priority: 4.2 type: FeedForwardNetwork hidden_sizes: [100] - dropout: 0.5 + dropout_rate: 0.5 streams: inputs: element_wise_activations predictions: question_image_activations @@ -118,7 +118,7 @@ pipeline: priority: 5.3 type: FeedForwardNetwork hidden_sizes: [110] - dropout: 0.5 + dropout_rate: 0.5 streams: inputs: concatenated_activations globals: From 5d81c5a06704c8423e4da2fc6629d0c09c3ce205 Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:41:15 -0700 Subject: [PATCH 07/11] Multimodal Compact Bilinear Pooling + pipeline --- .../{ => vqa}/element_wise_multiplication.yml | 0 .../multimodal_compact_bilinear_pooling.yml | 42 +++++ ...c2_classification_all_rnn_vgg16_concat.yml | 8 - .../c2_classification_all_rnn_vgg16_ewm.yml | 8 - .../c2_classification_all_rnn_vgg16_mcb.yml | 96 ++++++++++ ptp/components/models/__init__.py | 7 +- .../{ => vqa}/element_wise_multiplication.py | 0 .../multimodal_compact_bilinear_pooling.py | 169 ++++++++++++++++++ 8 files changed, 312 insertions(+), 18 deletions(-) rename configs/default/components/models/{ => vqa}/element_wise_multiplication.yml (100%) create mode 100644 configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml create mode 100644 configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml rename ptp/components/models/{ => vqa}/element_wise_multiplication.py (100%) create mode 100644 ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py diff --git a/configs/default/components/models/element_wise_multiplication.yml b/configs/default/components/models/vqa/element_wise_multiplication.yml similarity index 100% rename from configs/default/components/models/element_wise_multiplication.yml rename to 
configs/default/components/models/vqa/element_wise_multiplication.yml diff --git a/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml b/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml new file mode 100644 index 0000000..3d8a98a --- /dev/null +++ b/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml @@ -0,0 +1,42 @@ +# This file defines the default values for the Multimodal Compact Bilinear Pooling model. + +#################################################################### +# 1. CONFIGURATION PARAMETERS that will be LOADED by the component. +#################################################################### + +streams: + #################################################################### + # 2. Keymappings associated with INPUT and OUTPUT streams. + #################################################################### + + # Stream containing batch of encoded images (INPUT) + image_encodings: image_encodings + + # Stream containing batch of encoded questions (INPUT) + question_encodings: question_encodings + + # Stream containing outputs (OUTPUT) + outputs: outputs + +globals: + #################################################################### + # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. + #################################################################### + + # Size of the image encodings input (RETRIEVED) + image_encoding_size: image_encoding_size + + # Size of the question encodings input (RETRIEVED) + question_encoding_size: question_encoding_size + + # Size of the output (RETRIEVED) + output_size: output_size + + #################################################################### + # 4. Keymappings associated with GLOBAL variables that will be SET. + #################################################################### + + #################################################################### + # 5. Keymappings associated with statistics that will be ADDED. + #################################################################### + diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index 672a638..a678da9 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -1,14 +1,6 @@ # Load config defining problems for training, validation and testing. default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml -training: - # settings parameters - terminal_conditions: - loss_stop: 1.0e-2 - episode_limit: 100000 - epoch_limit: -1 - - pipeline: name: vqa_med_c2_classification_all_rnn_vgg_concat diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index 4208a7d..18cf84f 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -1,14 +1,6 @@ # Load config defining problems for training, validation and testing. 
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml -training: - # settings parameters - terminal_conditions: - loss_stop: 1.0e-2 - episode_limit: 10000 - epoch_limit: -1 - - pipeline: name: c2_classification_all_rnn_vgg16_ewm diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml new file mode 100644 index 0000000..258b9ee --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml @@ -0,0 +1,96 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +# Training parameters: +training: + problem: + batch_size: 5 +validation: + problem: + batch_size: 5 + +pipeline: + name: c2_classification_all_rnn_vgg16_mcb + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_encoder_output_size, fused_image_question_activation_size] + values: [200, 1000, 100] + + ################# PIPE 0: question ################# + # Questions encoding. + question_tokenizer: + priority: 1.1 + type: SentenceTokenizer + streams: + inputs: questions + outputs: tokenized_questions + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state_trainable: False + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: fusion + classification ################# + # Element wise multiplication + FF. 
+ question_image_fusion: + priority: 4.1 + type: MultimodalCompactBilinearPooling + dropout_rate: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: fused_image_question_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: fused_image_question_activation_size + + classifier: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + streams: + inputs: fused_image_question_activations + globals: + input_size: fused_image_question_activation_size + prediction_size: vocabulary_size_c2 + + #: pipeline diff --git a/ptp/components/models/__init__.py b/ptp/components/models/__init__.py index 97ab98b..b8b6093 100644 --- a/ptp/components/models/__init__.py +++ b/ptp/components/models/__init__.py @@ -1,5 +1,4 @@ from .convnet_encoder import ConvNetEncoder -from .element_wise_multiplication import ElementWiseMultiplication from .feed_forward_network import FeedForwardNetwork from .index_embeddings import IndexEmbeddings from .torch_vision_wrapper import TorchVisionWrapper @@ -8,9 +7,11 @@ from .recurrent_neural_network import RecurrentNeuralNetwork from .sentence_embeddings import SentenceEmbeddings +from .vqa.element_wise_multiplication import ElementWiseMultiplication +from .vqa.multimodal_compact_bilinear_pooling import MultimodalCompactBilinearPooling + __all__ = [ 'ConvNetEncoder', - 'ElementWiseMultiplication', 'FeedForwardNetwork', 'IndexEmbeddings', 'TorchVisionWrapper', @@ -18,4 +19,6 @@ 'Model', 'RecurrentNeuralNetwork', 'SentenceEmbeddings', + 'ElementWiseMultiplication', + 'MultimodalCompactBilinearPooling', ] diff --git a/ptp/components/models/element_wise_multiplication.py b/ptp/components/models/vqa/element_wise_multiplication.py similarity index 100% rename from ptp/components/models/element_wise_multiplication.py rename to ptp/components/models/vqa/element_wise_multiplication.py diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py new file mode 100644 index 0000000..4b6c7ce --- /dev/null +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (C) IBM Corporation 2018 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Tomasz Kornuta" + + +import torch +import numpy as np + +from ptp.components.models.model import Model +from ptp.data_types.data_definition import DataDefinition + + +class MultimodalCompactBilinearPooling(Model): + """ + Element of one of classical baselines for Visual Question Answering. + + The model inputs (question and image encodings) are combined with Compact Bilinear Pooling mechanism. + + Fukui, A., Park, D. H., Yang, D., Rohrbach, A., Darrell, T., & Rohrbach, M. (2016). Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847. 
+ + Gao, Y., Beijbom, O., Zhang, N., & Darrell, T. (2016). Compact bilinear pooling. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 317-326). + """ + def __init__(self, name, config): + """ + Initializes the model, creates the required layers. + + :param name: Name of the model (taken from the configuration file). + + :param config: Parameters read from configuration file. + :type config: ``ptp.configuration.ConfigInterface`` + + """ + super(MultimodalCompactBilinearPooling, self).__init__(name, MultimodalCompactBilinearPooling, config) + + # Get key mappings. + self.key_image_encodings = self.stream_keys["image_encodings"] + self.key_question_encodings = self.stream_keys["question_encodings"] + self.key_outputs = self.stream_keys["outputs"] + + # Retrieve input/output sizes from globals. + self.image_encoding_size = self.globals["image_encoding_size"] + self.question_encoding_size = self.globals["question_encoding_size"] + self.output_size = self.globals["output_size"] + + # Create the model. + #self.image_encodings_ff = torch.nn.Linear(self.image_encoding_size, self.output_size) + #self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.output_size) + + # Initialize sketch projection matrices. + self.image_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.image_encoding_size, self.output_size) + self.question_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.question_encoding_size, self.output_size) + + + + + def generate_count_sketch_projection_matrix(self, input_size, output_size): + """ + Initializes Count Sketch projection matrix for given input (size). + Its role will be to project vector v∈Rn to y∈Rd. + We initialize two vectors s∈{−1,1}n and h∈{1,...,d}n: + * s contains either 1 or −1 for each index + * h maps each index i in the input v to an index j in the output y. + Both s and h are initialized randomly from a uniform distribution and remain constant. + """ + # Generate s: 1 or -1 + s = 2 * np.random.randint(2, size=input_size) - 1 + s = torch.from_numpy(s) + #print("s=",s) + + # Generate h (indices) + h = np.random.randint(output_size, size=input_size) + #print("h=",h) + indices = np.concatenate((np.arange(input_size)[..., np.newaxis],h[..., np.newaxis]), axis=1) + indices = torch.from_numpy(indices) + #print("indices=",indices) + + # Generate sparse matrix. + sparse_sketch_matrix = torch.sparse.FloatTensor(indices.t(), s, torch.Size([input_size, output_size])) + #print("\n sparse_sketch_matrix=",sparse_sketch_matrix) + # Return dense matrix. + dense_ssm = sparse_sketch_matrix.to_dense().type(torch.FloatTensor) + #print("\n dense_ssm=",dense_ssm) + return dense_ssm + + + + def input_data_definitions(self): + """ + Function returns a dictionary with definitions of input data that are required by the component. + + :return: dictionary containing input data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_image_encodings: DataDefinition([-1, self.image_encoding_size], [torch.Tensor], "Batch of encoded images [BATCH_SIZE x IMAGE_ENCODING_SIZE]"), + self.key_question_encodings: DataDefinition([-1, self.question_encoding_size], [torch.Tensor], "Batch of encoded questions [BATCH_SIZE x QUESTION_ENCODING_SIZE]"), + } + + + def output_data_definitions(self): + """ + Function returns a dictionary with definitions of output data produced the component. 
+ + :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_outputs: DataDefinition([-1, self.output_size], [torch.Tensor], "Batch of outputs [BATCH_SIZE x OUTPUT_SIZE]") + } + + def forward(self, data_dict): + """ + Main forward pass of the model. + + :param data_dict: DataDict({'images',**}) + :type data_dict: ``ptp.dadatypes.DataDict`` + """ + + # Unpack DataDict. + enc_img = data_dict[self.key_image_encodings] + enc_q = data_dict[self.key_question_encodings] + #print("\n enc_img=",enc_img) + #print("\n image_sketch_projection_matrix=",self.image_sketch_projection_matrix) + + # Project both batches. + sketch_img = enc_img.mm(self.image_sketch_projection_matrix) + sketch_q = enc_q.mm(self.question_sketch_projection_matrix) + + # Add imaginary parts (with zeros). + sketch_img_reim = torch.stack([sketch_img, torch.zeros(sketch_img.shape)], dim=2) + sketch_q_reim = torch.stack([sketch_q, torch.zeros(sketch_q.shape)], dim=2) + #print("\n sketch_img_reim=",sketch_img_reim) + #print("\n sketch_img_reim.shape=",sketch_img_reim.shape) + + # Perform FFT. + # Returns the real and the imaginary parts together as one tensor of the same shape of input. + fft_img = torch.fft(sketch_img_reim, signal_ndim=1) + fft_q = torch.fft(sketch_q_reim, signal_ndim=1) + #print(fft_img) + + # Get real and imaginary parts. + real1 = fft_img[:,:,0] + imag1 = fft_img[:,:,1] + real2 = fft_q[:,:,0] + imag2 = fft_q[:,:,1] + + # Calculate product. + fft_product = torch.stack([real1 * real2 - imag1 * imag2, real1 * imag2 + imag1 * real2], dim = -1) + #print("fft_product=",fft_product) + + # Inverse FFT. + cbp = torch.ifft(fft_product, signal_ndim=1)[:,:,0] + #print("cbp=",cbp) + + # Add predictions to datadict. + data_dict.extend({self.key_outputs: cbp}) From 1e4b1231720af9ca10f968df5561aa4b21595059 Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:44:01 -0700 Subject: [PATCH 08/11] MCB gpu fix --- .../models/vqa/multimodal_compact_bilinear_pooling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py index 4b6c7ce..ed12ec3 100644 --- a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -93,7 +93,7 @@ def generate_count_sketch_projection_matrix(self, input_size, output_size): sparse_sketch_matrix = torch.sparse.FloatTensor(indices.t(), s, torch.Size([input_size, output_size])) #print("\n sparse_sketch_matrix=",sparse_sketch_matrix) # Return dense matrix. 
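The two building blocks above -- the fixed count-sketch projections and the FFT-based circular convolution -- can be condensed into a short sketch (illustrative batch size; the 1000/200/100 sizes come from the c2 MCB config; written against the newer torch.fft module rather than the torch.fft()/torch.ifft() calls used in the component):

    import numpy as np
    import torch

    def count_sketch_matrix(n, d):
        # s in {-1,+1}^n flips signs, h scatters each input index into one of d output buckets.
        s = 2 * np.random.randint(2, size=n) - 1
        h = np.random.randint(d, size=n)
        m = torch.zeros(n, d)
        m[torch.arange(n), torch.from_numpy(h)] = torch.from_numpy(s).float()
        return m

    n_img, n_q, d = 1000, 200, 100
    P_img, P_q = count_sketch_matrix(n_img, d), count_sketch_matrix(n_q, d)

    enc_img, enc_q = torch.randn(8, n_img), torch.randn(8, n_q)
    sketch_img, sketch_q = enc_img @ P_img, enc_q @ P_q      # project both modalities into R^d
    # Element-wise product in the frequency domain == circular convolution of the sketches.
    cbp = torch.fft.irfft(torch.fft.rfft(sketch_img) * torch.fft.rfft(sketch_q), n=d)   # [8 x 100]
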
- dense_ssm = sparse_sketch_matrix.to_dense().type(torch.FloatTensor) + dense_ssm = sparse_sketch_matrix.to_dense().type(self.app_state.FloatTensor) #print("\n dense_ssm=",dense_ssm) return dense_ssm From a2055b6ce51962ca551f4ea0f3f55d096dd665ec Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:45:20 -0700 Subject: [PATCH 09/11] MCB gpu fix 2 --- .../models/vqa/multimodal_compact_bilinear_pooling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py index ed12ec3..5c40347 100644 --- a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -140,8 +140,8 @@ def forward(self, data_dict): sketch_q = enc_q.mm(self.question_sketch_projection_matrix) # Add imaginary parts (with zeros). - sketch_img_reim = torch.stack([sketch_img, torch.zeros(sketch_img.shape)], dim=2) - sketch_q_reim = torch.stack([sketch_q, torch.zeros(sketch_q.shape)], dim=2) + sketch_img_reim = torch.stack([sketch_img, torch.zeros(sketch_img.shape).type(self.app_state.FloatTensor)], dim=2) + sketch_q_reim = torch.stack([sketch_q, torch.zeros(sketch_q.shape).type(self.app_state.FloatTensor)], dim=2) #print("\n sketch_img_reim=",sketch_img_reim) #print("\n sketch_img_reim.shape=",sketch_img_reim.shape) From e5189ad01264b580ee4ffac6bc9d3426555fe2f0 Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:48:48 -0700 Subject: [PATCH 10/11] MCB - batch size 128 --- .../c2_classification_all_rnn_vgg16_mcb.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml index 258b9ee..0ea068e 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml @@ -2,12 +2,12 @@ default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml # Training parameters: -training: - problem: - batch_size: 5 -validation: - problem: - batch_size: 5 +#training: +# problem: +# batch_size: 5 +#validation: +# problem: +# batch_size: 5 pipeline: name: c2_classification_all_rnn_vgg16_mcb From d35c9f7e2514d695158f5db00ffb3b54ac8d6e6f Mon Sep 17 00:00:00 2001 From: tkornut Date: Mon, 22 Apr 2019 21:59:42 -0700 Subject: [PATCH 11/11] cleanups of MCB --- .../models/vqa/multimodal_compact_bilinear_pooling.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py index 5c40347..4e2c6be 100644 --- a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -34,6 +34,9 @@ class MultimodalCompactBilinearPooling(Model): Fukui, A., Park, D. H., Yang, D., Rohrbach, A., Darrell, T., & Rohrbach, M. (2016). Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847. Gao, Y., Beijbom, O., Zhang, N., & Darrell, T. (2016). Compact bilinear pooling. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 317-326). 
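The two GPU fixes above replace the hard-coded torch.FloatTensor casts and plain torch.zeros(...) calls with the application-wide tensor type, so the projection matrices and the zero imaginary parts end up on the same device as the incoming encodings. Roughly (assuming app_state.FloatTensor resolves to torch.cuda.FloatTensor when running on a GPU):

    import torch

    proj = torch.randn(1000, 100)                  # built on the CPU at construction time
    enc = torch.randn(8, 1000, device="cuda")      # activations arrive on the GPU
    # enc.mm(proj) would fail with a device-mismatch error;
    # casting to the app-wide tensor type moves the matrix to the GPU first:
    proj = proj.type(torch.cuda.FloatTensor)       # stand-in for .type(self.app_state.FloatTensor)
    fused = enc.mm(proj)                           # [8 x 100]
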
+ + Inspired by implementation from: + https://github.com/DeepInsight-PCALab/CompactBilinearPooling-Pytorch/blob/master/CompactBilinearPooling.py """ def __init__(self, name, config): """ @@ -57,17 +60,11 @@ def __init__(self, name, config): self.question_encoding_size = self.globals["question_encoding_size"] self.output_size = self.globals["output_size"] - # Create the model. - #self.image_encodings_ff = torch.nn.Linear(self.image_encoding_size, self.output_size) - #self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.output_size) - # Initialize sketch projection matrices. self.image_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.image_encoding_size, self.output_size) self.question_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.question_encoding_size, self.output_size) - - def generate_count_sketch_projection_matrix(self, input_size, output_size): """ Initializes Count Sketch projection matrix for given input (size).