From 56cabb913722653c8a14d8d475f950a31eeefa3b Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sat, 4 May 2019 12:45:35 -0700 Subject: [PATCH 01/14] extend for C4 --- configs/vqa_med_2019/extend_answers.yml | 1 + configs/vqa_med_2019/extend_answers_c4.yml | 74 ++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 configs/vqa_med_2019/extend_answers_c4.yml diff --git a/configs/vqa_med_2019/extend_answers.yml b/configs/vqa_med_2019/extend_answers.yml index 9e2f9a4..304a7c1 100644 --- a/configs/vqa_med_2019/extend_answers.yml +++ b/configs/vqa_med_2019/extend_answers.yml @@ -24,6 +24,7 @@ validation_answers: type: *p_type data_folder: *data_folder split: validation + categories: all resize_image: *resize_image batch_size: 64 # Appy all preprocessing/data augmentations. diff --git a/configs/vqa_med_2019/extend_answers_c4.yml b/configs/vqa_med_2019/extend_answers_c4.yml new file mode 100644 index 0000000..40a1381 --- /dev/null +++ b/configs/vqa_med_2019/extend_answers_c4.yml @@ -0,0 +1,74 @@ +# This config is not a standalone config! +# It adds new sections (sets) without samplers and components for saving answers that we can use for getting final answers. + +training_answers: + problem: + type: &p_type VQAMED2019 + data_folder: &data_folder ~/data/vqa-med + split: training + categories: C4 + resize_image: &resize_image [224, 224] + batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + dataloader: + # No sampler, process samples in the same order. + shuffle: false + # Use 1 worker, so batches will follow the samples order. + num_workers: 1 + +validation_answers: + problem: + type: *p_type + data_folder: *data_folder + split: validation + categories: C4 + resize_image: *resize_image + batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + dataloader: + # No sampler, process samples in the same order. + shuffle: false + # Use 1 worker, so batches will follow the samples order. + num_workers: 1 + + +# Add component for exporting answers to files. +pipeline: + disable: viewer,question_tokenizer + # Viewers. 
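For reference, a minimal sketch of the file the submission_exporter below is configured to produce, assuming StreamFileExporter simply joins the listed streams with the '|' separator, one row per sample (the ids and answers here are made up):

# sketch: writing submission.txt as image_id|predicted_answer rows
image_ids = ["synpic52143", "synpic37599"]      # hypothetical image ids
predicted_answers = ["axial", "yes"]            # hypothetical predictions
with open("submission.txt", "w") as f:
    for image_id, answer in zip(image_ids, predicted_answers):
        f.write("{}|{}\n".format(image_id, answer))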
+ viewer_extended: + priority: 100.4 + type: StreamViewer + sample_number: 0 + input_streams: + indices,image_ids,tokenized_questions, + category_names,predicted_categories, + answers,tokenized_answers,predicted_answers + + answer_exporter: + priority: 100.5 + type: StreamFileExporter + separator: '|' + filename: 'answers.csv' + export_separator_line_to_csv: True + input_streams: + indices,image_ids,tokenized_questions, + category_names,predicted_categories, + answers,tokenized_answers,predicted_answers + + submission_exporter: + priority: 100.6 + type: StreamFileExporter + separator: '|' + filename: 'submission.txt' + input_streams: + image_ids, + predicted_answers + +#: pipeline From 71425d2e6641a661496847ebf9d80368566ac15e Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sat, 4 May 2019 13:59:08 -0700 Subject: [PATCH 02/14] Component: ReduceTensor + config, config c4_word_answer_glove_sum --- .../components/transforms/reduce_tensor.yml | 47 ++++++ .../c4_word_answer_glove_sum.yml | 91 ++++++++++++ .../default_c4_classification.yml | 4 +- ptp/components/transforms/__init__.py | 2 + ptp/components/transforms/list_to_tensor.py | 2 +- ptp/components/transforms/reduce_tensor.py | 134 ++++++++++++++++++ 6 files changed, 277 insertions(+), 3 deletions(-) create mode 100644 configs/default/components/transforms/reduce_tensor.yml create mode 100644 configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml create mode 100644 ptp/components/transforms/reduce_tensor.py diff --git a/configs/default/components/transforms/reduce_tensor.yml b/configs/default/components/transforms/reduce_tensor.yml new file mode 100644 index 0000000..57f267b --- /dev/null +++ b/configs/default/components/transforms/reduce_tensor.yml @@ -0,0 +1,47 @@ +# This file defines the default values for the ReduceTensor transformation. + +#################################################################### +# 1. CONFIGURATION PARAMETERS that will be LOADED by the component. +#################################################################### + +# Number of input dimensions, including batch (LOADED) +num_inputs_dims: 2 + +# Dimension along with the reduction will be applied (LOADED) +reduction_dim: 1 + +# Reduction type (LOADED) +# Options: sum | mean | min | max | argmin | argmax +reduction_type: sum + +# If True, the output tensor is of the same size as input, except dim where it is of size 1 (LOADED) +keepdim: False + +streams: + #################################################################### + # 2. Keymappings associated with INPUT and OUTPUT streams. + #################################################################### + + # Stream containing input tensor (INPUT) + inputs: inputs + + # Stream containing output tensor (OUTPUT) + outputs: outputs + +globals: + #################################################################### + # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. + #################################################################### + + # Size of the intput_item (GET) + # (last dimenstion) + input_size: input_size + + #################################################################### + # 4. Keymappings associated with GLOBAL variables that will be SET. + #################################################################### + + #################################################################### + # 5. Keymappings associated with statistics that will be ADDED. 
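#
# Shape example (a sketch of the semantics defined above): for an input of shape
# [batch=64 x words=7 x features=100] with reduction_dim: 1 and reduction_type: sum,
# i.e. torch.sum(inputs, dim=1, keepdim=keepdim), the output is [64 x 100] when
# keepdim is False and [64 x 1 x 100] when keepdim is True.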
+ #################################################################### + diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml new file mode 100644 index 0000000..2c12145 --- /dev/null +++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml @@ -0,0 +1,91 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml + +# Training parameters: +training: + problem: + categories: C4 + batch_size: 512 + # In here we won't use images at all. + stream_images: False + dataloader: + num_workers: 0 + +# Validation parameters: +validation: + problem: + categories: C4 + batch_size: 512 + # In here we won't use images at all. + stream_images: False + dataloader: + num_workers: 0 + + +pipeline: + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [answer_word_embeddings_size] + values: [100] + + # Answer encoding. + answer_tokenizer: + type: SentenceTokenizer + priority: 1.1 + preprocessing: lowercase,remove_punctuation + remove_characters: [“,”,’] + streams: + inputs: answers + outputs: tokenized_answer_words + + # Model 1: Embeddings + answer_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 100 + pretrained_embeddings_file: glove.6B.100d.txt + data_folder: ~/data/vqa-med + word_mappings_file: answer_words.c4.preprocessed.word.mappings.csv + export_word_mappings_to_globals: True + streams: + inputs: tokenized_answer_words + outputs: encoded_answer_words + globals: + vocabulary_size: answer_words_vocabulary_size + word_mappings: answer_words_word_mappings + + answer_reduction: + type: ReduceTensor + priority: 1.3 + num_inputs_dims: 3 + reduction_dim: 1 + reduction_type: sum + keepdim: False + streams: + inputs: encoded_answer_words + outputs: reduced_answers + globals: + input_size: answer_word_embeddings_size + + # Model. + classifier: + type: FeedForwardNetwork + hidden_sizes: [500, 500] + dropout_rate: 0.5 + priority: 3 + streams: + inputs: reduced_answers + globals: + input_size: answer_word_embeddings_size + prediction_size: vocabulary_size_c4 + + # Viewers. 
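The flow configured above amounts to a bag-of-embeddings classifier; a rough, self-contained sketch (vocabulary and class counts are illustrative, the modules merely stand in for the actual ptp components):

import torch
import torch.nn as nn

vocab_size, num_classes = 2000, 1500                 # hypothetical sizes
embed = nn.Embedding(vocab_size, 100)                # stands in for SentenceEmbeddings (GloVe 100d)
classifier = nn.Sequential(                          # stands in for FeedForwardNetwork
    nn.Linear(100, 500), nn.ReLU(), nn.Dropout(0.5),
    nn.Linear(500, 500), nn.ReLU(), nn.Dropout(0.5),
    nn.Linear(500, num_classes),
)

token_ids = torch.randint(0, vocab_size, (64, 7))    # [batch x answer words]
encoded = embed(token_ids)                           # [64 x 7 x 100]
reduced = encoded.sum(dim=1)                         # ReduceTensor, sum over dim 1: [64 x 100]
logits = classifier(reduced)                         # [64 x num_classes]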
+ viewer: + type: StreamViewer + priority: 100.4 + input_streams: answers, tokenized_answer_words, predicted_answers + +#: pipeline diff --git a/configs/vqa_med_2019/c4_classification/default_c4_classification.yml b/configs/vqa_med_2019/c4_classification/default_c4_classification.yml index 1e33502..dcfcd52 100644 --- a/configs/vqa_med_2019/c4_classification/default_c4_classification.yml +++ b/configs/vqa_med_2019/c4_classification/default_c4_classification.yml @@ -70,8 +70,8 @@ pipeline: type: PrecisionRecallStatistics priority: 100.3 use_word_mappings: True - show_class_scores: True - show_confusion_matrix: True + #show_class_scores: True + #show_confusion_matrix: True streams: targets: answers_ids globals: diff --git a/ptp/components/transforms/__init__.py b/ptp/components/transforms/__init__.py index 36d3845..6200892 100644 --- a/ptp/components/transforms/__init__.py +++ b/ptp/components/transforms/__init__.py @@ -1,10 +1,12 @@ from .concatenation import Concatenation from .list_to_tensor import ListToTensor +from .reduce_tensor import ReduceTensor from .reshape_tensor import ReshapeTensor __all__ = [ 'Concatenation', 'ListToTensor', + 'ReduceTensor', 'ReshapeTensor', ] diff --git a/ptp/components/transforms/list_to_tensor.py b/ptp/components/transforms/list_to_tensor.py index fbf3f21..0f12fb5 100644 --- a/ptp/components/transforms/list_to_tensor.py +++ b/ptp/components/transforms/list_to_tensor.py @@ -69,7 +69,7 @@ def input_data_definitions(self): def output_data_definitions(self): """ - Function returns a empty dictionary with definitions of output data produced the component. + Function returns a dictionary with definitions of output data produced the component. :return: Empty dictionary. """ diff --git a/ptp/components/transforms/reduce_tensor.py b/ptp/components/transforms/reduce_tensor.py new file mode 100644 index 0000000..d36ee25 --- /dev/null +++ b/ptp/components/transforms/reduce_tensor.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) tkornuta, IBM Corporation 2019 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Tomasz Kornuta" + +import torch + +from ptp.components.component import Component +from ptp.data_types.data_definition import DataDefinition +from ptp.configuration.config_parsing import get_value_from_dictionary + + +class ReduceTensor(Component): + """ + Class responsible for reducing tensor using indicated reduction method along a given dimension. + + """ + + def __init__(self, name, config): + """ + Initializes object. + + :param name: Name of the component loaded from the configuration file. + :type name: str + + :param config: Dictionary of parameters (read from the configuration ``.yaml`` file). + :type config: :py:class:`ptp.configuration.ConfigInterface` + + """ + # Call constructors of parent classes. + Component.__init__(self, name, ReduceTensor, config) + + # Set key mappings. + self.key_inputs = self.stream_keys["inputs"] + self.key_outputs = self.stream_keys["outputs"] + + # Get number of input dimensions from configuration. 
+ self.num_inputs_dims = self.config["num_inputs_dims"] + # Get size of a single input item (last dimension) from globals. + self.input_size = self.globals["input_size"] + + # Get reduction tparamsype from configuration. + self.dim = self.config["reduction_dim"] + self.keepdim = self.config["keepdim"] + + # Set reduction type. + rt = get_value_from_dictionary( + "reduction_type", self.config, + 'sum | mean | min | max | argmin | argmax'.split(" | ") + ) + reduction_types = {} + reduction_types["sum"] = torch.sum + reduction_types["mean"] = torch.mean + reduction_types["min"] = torch.min + reduction_types["max"] = torch.max + reduction_types["argmin"] = torch.argmin + reduction_types["argmax"] = torch.argmax + + self.reduction = reduction_types[rt] + + + def input_data_definitions(self): + """ + Function returns a dictionary with definitions of input data that are required by the component. + + :return: dictionary containing input data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + # Generate the description of input stream. + dims_desc = ["DIM {}".format(i) for i in range(self.num_inputs_dims-1)] + desc = "Batch of outputs [" + " x ".join(dims_desc) + "]" + return { + self.key_inputs: DataDefinition( + [-1]*(self.num_inputs_dims-1) + [self.input_size], + [torch.Tensor], + desc) + } + + def output_data_definitions(self): + """ + Function returns a dictionary with definitions of output data produced the component. + + :return: Empty dictionary. + """ + # Generate the dimensions and description of output stream. + if self.keepdim: + dims = [-1]*(self.num_inputs_dims-1) + [self.input_size] + dims[self.dim] = 1 + dims_desc = ["DIM {}".format(i) for i in range(self.num_inputs_dims)] + dims_desc[self.dim] = "1" + desc = "Batch of outputs [" + " x ".join(dims_desc) + "]" + else: + dims = [-1]*(self.num_inputs_dims-2) + [self.input_size] + dims_desc = ["DIM {}".format(i) for i in range(self.num_inputs_dims-1)] + desc = "Batch of outputs [" + " x ".join(dims_desc) + "]" + return { + self.key_outputs: DataDefinition( + dims, + [torch.Tensor], + desc) + } + + + def __call__(self, data_dict): + """ + Encodes "inputs" in the format of a single tensor. + Stores reshaped tensor in "outputs" field of in data_dict. + + :param data_dict: :py:class:`ptp.utils.DataDict` object containing (among others): + + - "inputs": expected input field containing tensor [BATCH_SIZE x ...] + + - "outputs": added output field containing tensor [BATCH_SIZE x ...] + """ + # Get inputs to be encoded. + inputs = data_dict[self.key_inputs] + + outputs = self.reduction(inputs, self.dim, self.keepdim) + + # Create the returned dict. 
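        # Shape example (illustrative): with num_inputs_dims=3, reduction_dim=1,
        # reduction_type=sum and keepdim=False, an input of [BATCH_SIZE x NUM_WORDS x INPUT_SIZE]
        # is reduced to [BATCH_SIZE x INPUT_SIZE].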
+ data_dict.extend({self.key_outputs: outputs}) + From 711a1d63c518446c609843d1e9846ca387141cc0 Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sat, 4 May 2019 14:51:24 -0700 Subject: [PATCH 03/14] C4 wor_answer classifiers --- .../components/models/sentence_embeddings.yml | 4 +- .../c4_word_answer_glove_sum.yml | 6 +- .../c4_word_answer_mimic_sum.yml | 91 +++++++++++++++++++ .../c4_word_answer_onehot_sum.yml | 91 +++++++++++++++++++ 4 files changed, 188 insertions(+), 4 deletions(-) create mode 100644 configs/vqa_med_2019/c4_classification/c4_word_answer_mimic_sum.yml create mode 100644 configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_sum.yml diff --git a/configs/default/components/models/sentence_embeddings.yml b/configs/default/components/models/sentence_embeddings.yml index 0c9432f..dceb725 100644 --- a/configs/default/components/models/sentence_embeddings.yml +++ b/configs/default/components/models/sentence_embeddings.yml @@ -40,7 +40,9 @@ fixed_padding: -1 # File containing pretrained embeddings (LOADED) # Empty means that no embeddings will be loaded. -# Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled +# Options: +# '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | +# glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled pretrained_embeddings_file: '' streams: diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml index 2c12145..5c1b1f4 100644 --- a/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml +++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml @@ -29,7 +29,7 @@ pipeline: type: GlobalVariablePublisher # Add input_size to globals. keys: [answer_word_embeddings_size] - values: [100] + values: [300] # Answer encoding. answer_tokenizer: @@ -45,8 +45,8 @@ pipeline: answer_embeddings: priority: 1.2 type: SentenceEmbeddings - embeddings_size: 100 - pretrained_embeddings_file: glove.6B.100d.txt + embeddings_size: 300 + pretrained_embeddings_file: glove.840B.300d.txt data_folder: ~/data/vqa-med word_mappings_file: answer_words.c4.preprocessed.word.mappings.csv export_word_mappings_to_globals: True diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_mimic_sum.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_mimic_sum.yml new file mode 100644 index 0000000..db28cc4 --- /dev/null +++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_mimic_sum.yml @@ -0,0 +1,91 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml + +# Training parameters: +training: + problem: + categories: C4 + batch_size: 512 + # In here we won't use images at all. + stream_images: False + dataloader: + num_workers: 0 + +# Validation parameters: +validation: + problem: + categories: C4 + batch_size: 512 + # In here we won't use images at all. + stream_images: False + dataloader: + num_workers: 0 + + +pipeline: + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [answer_word_embeddings_size] + values: [300] + + # Answer encoding. 
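A rough sketch (an assumption, not the component's actual code) of what the answer_tokenizer below does to raw answers: lowercase, drop the listed quote characters and punctuation, then split into word tokens.

import string

def preprocess(answer):
    answer = answer.lower()
    for ch in ['“', '”', '’']:                     # remove_characters
        answer = answer.replace(ch, '')
    answer = answer.translate(str.maketrans('', '', string.punctuation))
    return answer.split()

print(preprocess("Patient’s chest x-ray (PA view)"))
# -> ['patients', 'chest', 'xray', 'pa', 'view']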
+ answer_tokenizer: + type: SentenceTokenizer + priority: 1.1 + preprocessing: lowercase,remove_punctuation + remove_characters: [“,”,’] + streams: + inputs: answers + outputs: tokenized_answer_words + + # Model 1: Embeddings + answer_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 300 + pretrained_embeddings_file: mimic.fastText.no_clean.300d.pickled + data_folder: ~/data/vqa-med + word_mappings_file: answer_words.c4.preprocessed.word.mappings.csv + export_word_mappings_to_globals: True + streams: + inputs: tokenized_answer_words + outputs: encoded_answer_words + globals: + vocabulary_size: answer_words_vocabulary_size + word_mappings: answer_words_word_mappings + + answer_reduction: + type: ReduceTensor + priority: 1.3 + num_inputs_dims: 3 + reduction_dim: 1 + reduction_type: sum + keepdim: False + streams: + inputs: encoded_answer_words + outputs: reduced_answers + globals: + input_size: answer_word_embeddings_size + + # Model. + classifier: + type: FeedForwardNetwork + hidden_sizes: [500, 500] + dropout_rate: 0.5 + priority: 3 + streams: + inputs: reduced_answers + globals: + input_size: answer_word_embeddings_size + prediction_size: vocabulary_size_c4 + + # Viewers. + viewer: + type: StreamViewer + priority: 100.4 + input_streams: answers, tokenized_answer_words, predicted_answers + +#: pipeline diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_sum.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_sum.yml new file mode 100644 index 0000000..640f202 --- /dev/null +++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_sum.yml @@ -0,0 +1,91 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml + +# Training parameters: +training: + problem: + categories: C4 + batch_size: 512 + # In here we won't use images at all. + stream_images: False + dataloader: + num_workers: 0 + +# Validation parameters: +validation: + problem: + categories: C4 + batch_size: 512 + # In here we won't use images at all. + stream_images: False + dataloader: + num_workers: 0 + + +pipeline: + # Answer encoding. + answer_tokenizer: + type: SentenceTokenizer + priority: 1.1 + preprocessing: lowercase,remove_punctuation + remove_characters: [“,”,’] + streams: + inputs: answers + outputs: tokenized_answer_words + + answer_onehot_encoder: + type: SentenceOneHotEncoder + priority: 1.2 + data_folder: ~/data/vqa-med + word_mappings_file: answer_words.c4.preprocessed.word.mappings.csv + export_word_mappings_to_globals: True + streams: + inputs: tokenized_answer_words + outputs: encoded_answer_words + globals: + vocabulary_size: answer_words_vocabulary_size + word_mappings: answer_words_word_mappings + + answer_to_tensor: + type: ListToTensor + priority: 1.3 + num_inputs_dims: 3 + streams: + inputs: encoded_answer_words + outputs: tensor_answer_words + globals: + input_size: answer_words_vocabulary_size + + + answer_reduction: + type: ReduceTensor + priority: 1.4 + num_inputs_dims: 3 + reduction_dim: 1 + reduction_type: sum + keepdim: False + streams: + inputs: tensor_answer_words + outputs: reduced_answer_words + globals: + input_size: answer_words_vocabulary_size + + # Model. + classifier: + type: FeedForwardNetwork + hidden_sizes: [500, 500] + dropout_rate: 0.5 + priority: 3 + streams: + inputs: reduced_answer_words + globals: + input_size: answer_words_vocabulary_size + prediction_size: vocabulary_size_c4 + + # Viewers. 
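The one-hot variant above amounts to a bag-of-words encoding: summing the one-hot vectors of the answer words over the word dimension gives a word-count vector of vocabulary size, which the classifier then maps to the answer class. A toy sketch:

import torch

vocab_size = 10                                    # toy vocabulary
word_ids = torch.tensor([[2, 5, 5, 7]])            # one answer of 4 words: [batch=1 x words=4]
one_hot = torch.nn.functional.one_hot(word_ids, vocab_size).float()   # [1 x 4 x 10]
bag_of_words = one_hot.sum(dim=1)                  # ReduceTensor, sum over dim 1: [1 x 10]
print(bag_of_words)                                # index 5 holds a count of 2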
+ viewer: + type: StreamViewer + priority: 100.4 + input_streams: answers, tokenized_answer_words, predicted_answers + +#: pipeline From 5ad971720e14e23f9c180d414e968eddc83085b1 Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sat, 4 May 2019 15:18:55 -0700 Subject: [PATCH 04/14] fix in embeddings - some GloVe labels appeared to have many words --- ptp/components/utils/embeddings.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/ptp/components/utils/embeddings.py b/ptp/components/utils/embeddings.py index 6d22d71..ca1e0fb 100644 --- a/ptp/components/utils/embeddings.py +++ b/ptp/components/utils/embeddings.py @@ -116,12 +116,24 @@ def load_pretrained_embeddings(logger, folder, embeddings_name, word_to_ix, embe # Parse file and cherry pick the vectors that fit our vocabulary. for line in f.readlines(): values = line.split() - # Get word. - word = values[0] + if len(values) > embeddings_size+1: + #print(len(values)) + # Case: two (or more) words! + num_words = len(values) - embeddings_size + words = values[0:num_words] + word = ' '.join(words) + #print(word) + # Get remaining vector. + vector = np.array(values[num_words:], dtype='float32') + else: + # Get word. + word = values[0] + # Get remaining vector. + vector = np.array(values[1:], dtype='float32') + # Get index. index = word_to_ix.get(word) if index: - vector = np.array(values[1:], dtype='float32') assert (len(vector) == embeddings_size), "Embeddings size must be equal to the size of pretrained embeddings!" # Ok, set vector. embeddings[index] = vector From 5aae937fa0128dd2cd34d1a0b7c53b674d0a4fa1 Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sat, 4 May 2019 15:39:04 -0700 Subject: [PATCH 05/14] Added bar to loading of word embeddins (as this might take a while for bigger embeddings) --- .../c4_classification/c4_word_answer_glove_sum.yml | 6 +++--- ptp/components/utils/embeddings.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml index 5c1b1f4..2c12145 100644 --- a/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml +++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_glove_sum.yml @@ -29,7 +29,7 @@ pipeline: type: GlobalVariablePublisher # Add input_size to globals. keys: [answer_word_embeddings_size] - values: [300] + values: [100] # Answer encoding. answer_tokenizer: @@ -45,8 +45,8 @@ pipeline: answer_embeddings: priority: 1.2 type: SentenceEmbeddings - embeddings_size: 300 - pretrained_embeddings_file: glove.840B.300d.txt + embeddings_size: 100 + pretrained_embeddings_file: glove.6B.100d.txt data_folder: ~/data/vqa-med word_mappings_file: answer_words.c4.preprocessed.word.mappings.csv export_word_mappings_to_globals: True diff --git a/ptp/components/utils/embeddings.py b/ptp/components/utils/embeddings.py index ca1e0fb..099684d 100644 --- a/ptp/components/utils/embeddings.py +++ b/ptp/components/utils/embeddings.py @@ -19,7 +19,10 @@ import os import numpy as np +import tqdm + import torch + import ptp.components.utils.io as io @@ -111,6 +114,9 @@ def load_pretrained_embeddings(logger, folder, embeddings_name, word_to_ix, embe else: logger.info("File '{}' containing pretrained embeddings found in '{}' folder".format(embeddings_name, folder)) + # Get number of lines/vectors. 
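    # Counting the lines first costs one extra pass over the file, but it gives the
    # progress bar below a meaningful total; for the larger files (e.g. glove.840B.300d.txt,
    # roughly 2.2M vectors) loading can otherwise appear stalled.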
+ num_lines = sum([1 for line in open(os.path.join(folder, embeddings_name))]) + t = tqdm.tqdm(total=num_lines) with open(os.path.join(folder, embeddings_name)) as f: # Parse file and cherry pick the vectors that fit our vocabulary. @@ -130,7 +136,6 @@ def load_pretrained_embeddings(logger, folder, embeddings_name, word_to_ix, embe word = values[0] # Get remaining vector. vector = np.array(values[1:], dtype='float32') - # Get index. index = word_to_ix.get(word) if index: @@ -139,7 +144,9 @@ def load_pretrained_embeddings(logger, folder, embeddings_name, word_to_ix, embe embeddings[index] = vector # Increment counter. num_loaded_embs += 1 - + t.update() + t.close() + logger.info("Loaded {} pretrained embeddings for vocabulary of size {} from {}".format(num_loaded_embs, len(word_to_ix), embeddings_name)) # Return matrix with embeddings. From 2be2380cb6df5ccc004fb06952ef489e570fb5ff Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sat, 4 May 2019 19:55:56 -0700 Subject: [PATCH 06/14] Two pipelines for my table --- ...lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml | 12 + ...ic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml | 6 +- ...ve_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml | 383 ++++++++++++++++++ ...ve_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml | 383 ++++++++++++++++++ 4 files changed, 781 insertions(+), 3 deletions(-) create mode 100644 configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml create mode 100644 configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml diff --git a/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml index 9f9d7c9..d7ee74b 100644 --- a/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml @@ -39,6 +39,8 @@ hyperparameters: answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] batch_size: &batch_size 64 + preload_images: &preload_images True + num_workers: &num_workers 0 # Training parameters: training: @@ -49,10 +51,15 @@ training: # Appy all preprocessing/data augmentations. question_preprocessing: *question_preprocessing image_preprocessing: *image_preprocessing + # Preload images. + preload_images: *preload_images streams: questions: tokenized_questions sampler: weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Use four workers for loading images. + dataloader: + num_workers: *num_workers # Optimizer parameters: optimizer: @@ -67,14 +74,19 @@ training: # Validation parameters: validation: + partial_validation_interval: 100 problem: batch_size: *batch_size categories: C1,C2,C3 # Appy all preprocessing/data augmentations. question_preprocessing: *question_preprocessing image_preprocessing: *image_preprocessing + # Preload images: false, as we will need them only once, at the end. + preload_images: false streams: questions: tokenized_questions + dataloader: + num_workers: 1 pipeline: diff --git a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml index 958161a..a4d91bb 100644 --- a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml @@ -81,12 +81,12 @@ validation: # Appy all preprocessing/data augmentations. 
question_preprocessing: *question_preprocessing image_preprocessing: *image_preprocessing - # Preload images. - preload_images: *preload_images + # Preload images: false, as we will need them only once, at the end. + preload_images: false streams: questions: tokenized_questions dataloader: - num_workers: *num_workers + num_workers: 1 pipeline: diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml new file mode 100644 index 0000000..5168831 --- /dev/null +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml @@ -0,0 +1,383 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +hyperparameters: + # In here I am putting some of the hyperparameters from spreadsheet. + + question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize + # Accepted formats: a,b,c or [a,b,c] + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + + image_preprocessing: &image_preprocessing normalize + # Accepted formats: a,b,c or [a,b,c] + # none | random_affine | random_horizontal_flip | normalize | all + + # Image encoder. + image_encoder_model: &image_encoder_model vgg16 + # Options: vgg16 | densenet121 | resnet152 | resnet50 + image_encoder_output_size_val: &image_encoder_output_size_val 100 + + # Question encoder. + question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt + # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 50 + question_encoder_lstm_size_val: &question_encoder_lstm_size_val 50 + question_encoder_output_size_val: &question_encoder_output_size_val 100 + + # Fusion I: image + question + question_image_fusion_type_val: &question_image_fusion_type ElementWiseMultiplication + # Options: ElementWiseMultiplication | ? (component: question_image_fusion) + question_image_fusion_size_val: &question_image_fusion_size_val 100 + + # Image size encoder. + image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 + + # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 110 + + # Final classifier: FFN. + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] + + batch_size: &batch_size 100 + preload_images: &preload_images True + num_workers: &num_workers 0 + +# Training parameters: +training: + problem: + batch_size: *batch_size + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + # Preload images. + preload_images: *preload_images + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Use four workers for loading images. 
+ dataloader: + num_workers: *num_workers + + # Optimizer parameters: + optimizer: + name: Adam + lr: 0.0001 + + # Terminal conditions: + terminal_conditions: + loss_stop: 1.0e-3 + episode_limit: 10000 + epoch_limit: -1 + +# Validation parameters: +validation: + partial_validation_interval: 100 + problem: + batch_size: *batch_size + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + # Preload images: false, as we will need them only once, at the end. + preload_images: false + streams: + questions: tokenized_questions + dataloader: + num_workers: 1 + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, fused_activation_size] + values: [*question_encoder_output_size_val, 2, *image_size_encoder_output_size_val, *image_encoder_output_size_val, *question_image_fusion_size_val] + + # Statistics. + batch_size: + priority: 0.1 + type: BatchSizeStatistics + + # Answer encoding. + pipe1_all_answer_indexer: + priority: 0.2 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: answers_ids + globals: + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn + + + ################# PIPE 0: QUESTION CATEGORIZATION ################# + + # Add global variables - the ones related to only question categorization. + pipe0_global_publisher: + priority: 0.3 + type: GlobalVariablePublisher + # Add input_size to globals. 
+ keys: [pipe0_question_encoder_output_size] + values: [100] + + # Model 1: question embeddings + pipe0_question_embeddings: + priority: 0.4 + type: SentenceEmbeddings + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + globals: + embeddings_size: pipe0_embeddings_size + + # Model 2: question RNN + pipe0_lstm: + priority: 0.5 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_question_activations + globals: + input_size: pipe0_embeddings_size + prediction_size: pipe0_question_encoder_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.6 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.7 + streams: + inputs: pipe0_question_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: pipe0_question_encoder_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.8 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + priority: 0.9 + type: AccuracyStatistics + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: *question_encoder_embeddings_size_val + pretrained_embeddings_file: *question_encoder_embeddings + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + globals: + embeddings_size: pipe1_embeddings_size + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + hidden_size: *question_encoder_lstm_size_val + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: pipe1_embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. 
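An illustrative sketch of what the image_encoder below provides (an assumption about TorchVisionWrapper, not its actual implementation): a pretrained VGG-16 backbone whose classifier head is replaced so it emits a fixed-size activation vector matching image_encoder_output_size.

import torch
import torchvision

backbone = torchvision.models.vgg16(pretrained=True)
backbone.classifier[-1] = torch.nn.Linear(4096, 100)    # head resized to image_encoder_output_size

images = torch.randn(2, 3, 224, 224)                    # [batch x channels x height x width]
image_activations = backbone(images)                    # [2 x 100]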
+ image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model: *image_encoder_model + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + priority: 3.1 + type: FeedForwardNetwork + use_losfotmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: image-question fusion ################# + # Element wise multiplication + FF. + question_image_fusion: + priority: 4.1 + type: *question_image_fusion_type + dropout_rate: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: fused_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: fused_activation_size + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [*question_image_fusion_size_val] + dropout_rate: 0.5 + use_logsoftmax: False + streams: + inputs: fused_activations + predictions: question_image_activations + globals: + input_size: fused_activation_size + prediction_size: fused_activation_size + + ################# PIPE 5: image-question-image size fusion ################# + + # 5th subpipeline: concatenation + concat: + priority: 5.1 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,*question_image_fusion_size_val],[-1,*image_size_encoder_output_size_val]] + output_dims: [-1,*question_image_size_fusion_size_val] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + ################# PIPE 6: C1 + C2 + C3 questions ################# + + # Model 4: FFN C123 answering + pipe6_c123_answer_classifier: + priority: 6.3 + type: FeedForwardNetwork + hidden: *answer_classifier_hidden_sizes_val + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe6_c123_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_binary_yn + + pipe6_c123_nllloss: + priority: 6.4 + type: NLLLoss + targets_dim: 1 + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + loss: pipe6_c123_loss + + pipe6_c123_precision_recall: + priority: 6.5 + type: PrecisionRecallStatistics + use_word_mappings: True + show_class_scores: True + #show_confusion_matrix: True + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + globals: + word_mappings: word_mappings_c123_binary_yn + statistics: + precision: pipe6_c123_precision + recall: pipe6_c123_recall + f1score: pipe6_c123_f1score + + # C123 Predictions decoder. + pipe5_c123_prediction_decoder: + priority: 6.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe6_c123_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_binary_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + # Viewers. 
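A rough sketch (an assumption, not the actual components) of the fusion steps configured above: fuse question and image activations by element-wise multiplication, then concatenate the image-size encoding before the final answer classifier.

import torch

question_activations = torch.randn(64, 100)      # from pipe1_lstm
image_activations = torch.randn(64, 100)         # from image_encoder
image_size_activations = torch.randn(64, 10)     # from image_size_encoder

fused_activations = question_activations * image_activations                 # [64 x 100]
concatenated_activations = torch.cat([fused_activations,
                                      image_size_activations], dim=1)        # [64 x 110]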
+ viewer: + priority: 9.3 + type: StreamViewer + input_streams: + tokenized_questions, + category_names, pipe0_predicted_question_categories_names, + answers, predicted_answers + + +#: pipeline diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml new file mode 100644 index 0000000..43a838b --- /dev/null +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml @@ -0,0 +1,383 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +hyperparameters: + # In here I am putting some of the hyperparameters from spreadsheet. + + question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize + # Accepted formats: a,b,c or [a,b,c] + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + + image_preprocessing: &image_preprocessing normalize + # Accepted formats: a,b,c or [a,b,c] + # none | random_affine | random_horizontal_flip | normalize | all + + # Image encoder. + image_encoder_model: &image_encoder_model vgg16 + # Options: vgg16 | densenet121 | resnet152 | resnet50 + image_encoder_output_size_val: &image_encoder_output_size_val 100 + + # Question encoder. + question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt + # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 50 + question_encoder_lstm_size_val: &question_encoder_lstm_size_val 50 + question_encoder_output_size_val: &question_encoder_output_size_val 100 + + # Fusion I: image + question + question_image_fusion_type_val: &question_image_fusion_type MultimodalCompactBilinearPooling + # Options: ElementWiseMultiplication | MultimodalCompactBilinearPooling | + question_image_fusion_size_val: &question_image_fusion_size_val 100 + + # Image size encoder. + image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 + + # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 110 + + # Final classifier: FFN. + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] + + batch_size: &batch_size 100 + preload_images: &preload_images True + num_workers: &num_workers 0 + +# Training parameters: +training: + problem: + batch_size: *batch_size + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + # Preload images. + preload_images: *preload_images + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Use four workers for loading images. 
+ dataloader: + num_workers: *num_workers + + # Optimizer parameters: + optimizer: + name: Adam + lr: 0.0001 + + # Terminal conditions: + terminal_conditions: + loss_stop: 1.0e-3 + episode_limit: 10000 + epoch_limit: -1 + +# Validation parameters: +validation: + partial_validation_interval: 100 + problem: + batch_size: *batch_size + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + # Preload images: false, as we will need them only once, at the end. + preload_images: false + streams: + questions: tokenized_questions + dataloader: + num_workers: 1 + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, fused_activation_size] + values: [*question_encoder_output_size_val, 2, *image_size_encoder_output_size_val, *image_encoder_output_size_val, *question_image_fusion_size_val] + + # Statistics. + batch_size: + priority: 0.1 + type: BatchSizeStatistics + + # Answer encoding. + pipe1_all_answer_indexer: + priority: 0.2 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: answers_ids + globals: + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn + + + ################# PIPE 0: QUESTION CATEGORIZATION ################# + + # Add global variables - the ones related to only question categorization. + pipe0_global_publisher: + priority: 0.3 + type: GlobalVariablePublisher + # Add input_size to globals. 
+ keys: [pipe0_question_encoder_output_size] + values: [100] + + # Model 1: question embeddings + pipe0_question_embeddings: + priority: 0.4 + type: SentenceEmbeddings + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + globals: + embeddings_size: pipe0_embeddings_size + + # Model 2: question RNN + pipe0_lstm: + priority: 0.5 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_question_activations + globals: + input_size: pipe0_embeddings_size + prediction_size: pipe0_question_encoder_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.6 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.7 + streams: + inputs: pipe0_question_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: pipe0_question_encoder_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.8 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + priority: 0.9 + type: AccuracyStatistics + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: *question_encoder_embeddings_size_val + pretrained_embeddings_file: *question_encoder_embeddings + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + globals: + embeddings_size: pipe1_embeddings_size + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + hidden_size: *question_encoder_lstm_size_val + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: pipe1_embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. 
+ image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model: *image_encoder_model + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + priority: 3.1 + type: FeedForwardNetwork + use_losfotmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: image-question fusion ################# + # Element wise multiplication + FF. + question_image_fusion: + priority: 4.1 + type: *question_image_fusion_type + dropout_rate: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: fused_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: fused_activation_size + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [*question_image_fusion_size_val] + dropout_rate: 0.5 + use_logsoftmax: False + streams: + inputs: fused_activations + predictions: question_image_activations + globals: + input_size: fused_activation_size + prediction_size: fused_activation_size + + ################# PIPE 5: image-question-image size fusion ################# + + # 5th subpipeline: concatenation + concat: + priority: 5.1 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,*question_image_fusion_size_val],[-1,*image_size_encoder_output_size_val]] + output_dims: [-1,*question_image_size_fusion_size_val] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + ################# PIPE 6: C1 + C2 + C3 questions ################# + + # Model 4: FFN C123 answering + pipe6_c123_answer_classifier: + priority: 6.3 + type: FeedForwardNetwork + hidden: *answer_classifier_hidden_sizes_val + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe6_c123_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_binary_yn + + pipe6_c123_nllloss: + priority: 6.4 + type: NLLLoss + targets_dim: 1 + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + loss: pipe6_c123_loss + + pipe6_c123_precision_recall: + priority: 6.5 + type: PrecisionRecallStatistics + use_word_mappings: True + show_class_scores: True + #show_confusion_matrix: True + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + globals: + word_mappings: word_mappings_c123_binary_yn + statistics: + precision: pipe6_c123_precision + recall: pipe6_c123_recall + f1score: pipe6_c123_f1score + + # C123 Predictions decoder. + pipe5_c123_prediction_decoder: + priority: 6.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe6_c123_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_binary_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + # Viewers. 
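This variant swaps the fusion above for MultimodalCompactBilinearPooling. A very rough sketch of MCB under the usual count-sketch + FFT formulation (an illustration assuming a recent PyTorch with torch.fft, not the component's actual implementation):

import torch
import torch.nn.functional as F

def count_sketch(x, h, s, d):
    # x: [batch x n]; h: random bins in [0, d); s: random signs in {-1, +1}
    sketch = x.new_zeros(x.size(0), d)
    sketch.index_add_(1, h, x * s)
    return sketch

def mcb(v, q, d=1024):
    torch.manual_seed(0)                           # fixed random projections
    hv = torch.randint(0, d, (v.size(1),))
    sv = torch.randint(0, 2, (v.size(1),)).float() * 2 - 1
    hq = torch.randint(0, d, (q.size(1),))
    sq = torch.randint(0, 2, (q.size(1),)).float() * 2 - 1
    # Circular convolution of the two sketches via element-wise product in the frequency domain.
    fused = torch.fft.irfft(torch.fft.rfft(count_sketch(v, hv, sv, d)) *
                            torch.fft.rfft(count_sketch(q, hq, sq, d)), n=d)
    fused = torch.sign(fused) * torch.sqrt(fused.abs() + 1e-12)   # signed square root
    return F.normalize(fused, dim=1)                              # l2 normalisation

fused_activations = mcb(torch.randn(64, 100), torch.randn(64, 100))   # [64 x 1024]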
+ viewer: + priority: 9.3 + type: StreamViewer + input_streams: + tokenized_questions, + category_names, pipe0_predicted_question_categories_names, + answers, predicted_answers + + +#: pipeline From e86366345974440a3a958a40f83bbe2a1438dd99 Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sat, 4 May 2019 20:30:07 -0700 Subject: [PATCH 07/14] glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss config + modifications of VQA_Attention - export of output_size to globals --- .../components/models/vqa/attention.yml | 6 +- ...ic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml | 4 + ...ve_lstm_vgg16_att_is_cat_ffn_c123_loss.yml | 391 ++++++++++++++++++ ...ve_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml | 4 + ptp/components/models/vqa/attention.py | 2 + 5 files changed, 404 insertions(+), 3 deletions(-) create mode 100644 configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml diff --git a/configs/default/components/models/vqa/attention.yml b/configs/default/components/models/vqa/attention.yml index 830f4b8..4018be1 100644 --- a/configs/default/components/models/vqa/attention.yml +++ b/configs/default/components/models/vqa/attention.yml @@ -46,13 +46,13 @@ globals: # Size of the question encodings input (RETRIEVED) question_encoding_size: question_encoding_size - # Size of the output (RETRIEVED) - output_size: output_size - #################################################################### # 4. Keymappings associated with GLOBAL variables that will be SET. #################################################################### + # Size of the output (SET) + output_size: output_size + #################################################################### # 5. Keymappings associated with statistics that will be ADDED. #################################################################### diff --git a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml index a4d91bb..516df63 100644 --- a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml @@ -90,6 +90,10 @@ validation: pipeline: + # Disable flow 0. + disable: + pipe0_question_embeddings,pipe0_question_embeddings,pipe0_lstm,pipe0_classifier, + pipe0_category_decoder,pipe0_category_accuracy ################# PIPE 0: SHARED ################# diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml new file mode 100644 index 0000000..fe63930 --- /dev/null +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml @@ -0,0 +1,391 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +hyperparameters: + # In here I am putting some of the hyperparameters from spreadsheet. + + question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize + # Accepted formats: a,b,c or [a,b,c] + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + + image_preprocessing: &image_preprocessing normalize + # Accepted formats: a,b,c or [a,b,c] + # none | random_affine | random_horizontal_flip | normalize | all + + # Image encoder. 
+ image_encoder_model: &image_encoder_model vgg16 + # Options: vgg16 | densenet121 | resnet152 | resnet50 + #image_encoder_output_size_val: &image_encoder_output_size_val 100 + # INFO: this variable is not important, as we are using features in this pipeline!! + + # Question encoder. + question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt + # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 50 + question_encoder_lstm_size_val: &question_encoder_lstm_size_val 50 + question_encoder_output_size_val: &question_encoder_output_size_val 100 + + # Fusion I: image + question + question_image_fusion_type_val: &question_image_fusion_type VQA_Attention + # Options: ElementWiseMultiplication | VQA_Attention + #question_image_fusion_size_val: &question_image_fusion_size_val 1124 + # INFO: this variable is set by VQA_Attention component! + + # Image size encoder. + image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 + + # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 1134 + + # Final classifier: FFN. + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [500] + + batch_size: &batch_size 100 + preload_images: &preload_images True + num_workers: &num_workers 0 + +# Training parameters: +training: + problem: + batch_size: *batch_size + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + # Preload images. + preload_images: *preload_images + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Use four workers for loading images. + dataloader: + num_workers: *num_workers + + # Optimizer parameters: + optimizer: + name: Adam + lr: 0.0001 + + # Terminal conditions: + terminal_conditions: + loss_stop: 1.0e-3 + episode_limit: 10000 + epoch_limit: -1 + +# Validation parameters: +validation: + partial_validation_interval: 100 + problem: + batch_size: *batch_size + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + # Preload images: false, as we will need them only once, at the end. + preload_images: false + streams: + questions: tokenized_questions + dataloader: + num_workers: 1 + + +pipeline: + # Disable flow 0. + disable: + pipe0_question_embeddings,pipe0_question_embeddings,pipe0_lstm,pipe0_classifier, + pipe0_category_decoder,pipe0_category_accuracy + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size] #, image_encoder_output_size] #, fused_activation_size] + values: [*question_encoder_output_size_val, 2, *image_size_encoder_output_size_val] #, *image_encoder_output_size_val] #, *question_image_fusion_size_val] + + # Statistics. 
+ batch_size: + priority: 0.1 + type: BatchSizeStatistics + + # Answer encoding. + pipe1_all_answer_indexer: + priority: 0.2 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: answers_ids + globals: + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn + + + ################# PIPE 0: QUESTION CATEGORIZATION ################# + + # Add global variables - the ones related to only question categorization. + pipe0_global_publisher: + priority: 0.3 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [pipe0_question_encoder_output_size] + values: [100] + + # Model 1: question embeddings + pipe0_question_embeddings: + priority: 0.4 + type: SentenceEmbeddings + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + globals: + embeddings_size: pipe0_embeddings_size + + # Model 2: question RNN + pipe0_lstm: + priority: 0.5 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_question_activations + globals: + input_size: pipe0_embeddings_size + prediction_size: pipe0_question_encoder_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.6 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.7 + streams: + inputs: pipe0_question_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: pipe0_question_encoder_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.8 + type: WordDecoder + # Use the same word mappings as label indexer. 
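The three pipe0 models above (question embeddings, LSTM, classifier) are restored from an already trained question-categorization checkpoint and kept frozen: load.file points at the checkpoint, model names the sub-model to restore, and freeze: True excludes its weights from optimization. A rough PyTorch sketch of that pattern, assuming the checkpoint stores one state dict per named sub-model (the path and key below are illustrative, not taken from the framework):

    import torch

    # Restore one sub-model from a shared checkpoint (path and key are assumptions).
    checkpoint = torch.load("vqa_med_question_categorization_rnn_ffn_best.pt", map_location="cpu")
    lstm = torch.nn.LSTM(input_size=50, hidden_size=100, batch_first=True)
    lstm.load_state_dict(checkpoint["lstm"])

    # Freeze: keep the restored values, but compute no gradients for them.
    for param in lstm.parameters():
        param.requires_grad = False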
+ import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + priority: 0.9 + type: AccuracyStatistics + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: *question_encoder_embeddings_size_val + pretrained_embeddings_file: *question_encoder_embeddings + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + globals: + embeddings_size: pipe1_embeddings_size + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + hidden_size: *question_encoder_lstm_size_val + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: pipe1_embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model: *image_encoder_model + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + priority: 3.1 + type: FeedForwardNetwork + use_losfotmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: image-question fusion ################# + # Attention + FF. + question_image_fusion: + priority: 4.1 + type: *question_image_fusion_type + dropout_rate: 0.5 + # Attention params. 
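The image_encoder above wraps a torchvision backbone, and return_feature_maps: True makes it output the convolutional feature maps rather than the final classification vector; those maps are what the attention fusion below attends over. A minimal sketch of what that amounts to for VGG16 (standard torchvision API; shapes assume the 224x224 inputs these configs resize to):

    import torch
    import torchvision

    backbone = torchvision.models.vgg16(pretrained=True)
    images = torch.randn(2, 3, 224, 224)        # dummy batch of resized images
    feature_maps = backbone.features(images)    # [2, 512, 7, 7]: 512-deep maps fed to attention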
+ latent_size: 100 + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_activations + outputs: fused_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: fused_activation_size + + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + #hidden_sizes: [*question_image_fusion_size_val] + dropout_rate: 0.5 + use_logsoftmax: False + streams: + inputs: fused_activations + predictions: question_image_activations + globals: + input_size: fused_activation_size + prediction_size: fused_activation_size + + ################# PIPE 5: image-question-image size fusion ################# + + # 5th subpipeline: concatenation + concat: + priority: 5.1 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,1124],[-1,*image_size_encoder_output_size_val]] + output_dims: [-1,*question_image_size_fusion_size_val] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + ################# PIPE 6: C1 + C2 + C3 questions ################# + + # Model 4: FFN C123 answering + pipe6_c123_answer_classifier: + priority: 6.3 + type: FeedForwardNetwork + hidden: *answer_classifier_hidden_sizes_val + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe6_c123_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_binary_yn + + pipe6_c123_nllloss: + priority: 6.4 + type: NLLLoss + targets_dim: 1 + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + loss: pipe6_c123_loss + + pipe6_c123_precision_recall: + priority: 6.5 + type: PrecisionRecallStatistics + use_word_mappings: True + show_class_scores: True + #show_confusion_matrix: True + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + globals: + word_mappings: word_mappings_c123_binary_yn + statistics: + precision: pipe6_c123_precision + recall: pipe6_c123_recall + f1score: pipe6_c123_f1score + + # C123 Predictions decoder. + pipe5_c123_prediction_decoder: + priority: 6.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe6_c123_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_binary_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + # Viewers. + viewer: + priority: 9.3 + type: StreamViewer + input_streams: + tokenized_questions, + category_names, pipe0_predicted_question_categories_names, + answers, predicted_answers + + +#: pipeline diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml index 5168831..01d13e4 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml @@ -90,6 +90,10 @@ validation: pipeline: + # Disable flow 0. 
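In the answering pipe above, pipe6_c123_nllloss pairs the classifier's predictions with NLLLoss, which in PyTorch expects log-probabilities; that is why the intermediate models in this pipeline set use_logsoftmax: False, while the final answer classifier that feeds the loss is presumably left with its default log-softmax output. A small self-contained check of the equivalence involved:

    import torch

    logits = torch.randn(4, 10)                               # raw classifier outputs
    log_probs = torch.nn.functional.log_softmax(logits, dim=1)
    targets = torch.tensor([1, 0, 3, 9])

    nll = torch.nn.NLLLoss()(log_probs, targets)
    ce = torch.nn.CrossEntropyLoss()(logits, targets)
    assert torch.allclose(nll, ce)   # NLLLoss on log-probs == cross-entropy on raw logits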
+ disable: + pipe0_question_embeddings,pipe0_question_embeddings,pipe0_lstm,pipe0_classifier, + pipe0_category_decoder,pipe0_category_accuracy ################# PIPE 0: SHARED ################# diff --git a/ptp/components/models/vqa/attention.py b/ptp/components/models/vqa/attention.py index 15c7914..88c868f 100644 --- a/ptp/components/models/vqa/attention.py +++ b/ptp/components/models/vqa/attention.py @@ -62,6 +62,8 @@ def __init__(self, name, config): # Output feature size self.output_size = self.feature_maps_depth*self.num_attention_heads + self.question_encoding_size + # Export to globals. + self.globals["output_size"] = self.output_size # Map image and question encodings to a common latent space of dimension 'latent_size'. self.image_encodings_conv = torch.nn.Conv2d(self.feature_maps_depth, self.latent_size, 1, bias=False) From f428b311be43ed67d95c13d10ee7d8713801bed6 Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sat, 4 May 2019 20:36:08 -0700 Subject: [PATCH 08/14] disabling components related to question categorization --- .../glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml | 4 ++++ .../tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml index d7ee74b..65eed52 100644 --- a/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml @@ -90,6 +90,10 @@ validation: pipeline: + # Disable flow 0. + disable: + pipe0_question_embeddings,pipe0_question_embeddings,pipe0_lstm,pipe0_classifier, + pipe0_category_decoder,pipe0_category_accuracy ################# PIPE 0: SHARED ################# diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml index 43a838b..f7d4d89 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml @@ -90,6 +90,10 @@ validation: pipeline: + # Disable flow 0. + disable: + pipe0_question_embeddings,pipe0_question_embeddings,pipe0_lstm,pipe0_classifier, + pipe0_category_decoder,pipe0_category_accuracy ################# PIPE 0: SHARED ################# From 92a7b24cf6937bba393a9208fc5a3443cf604056 Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sun, 5 May 2019 09:54:16 -0700 Subject: [PATCH 09/14] config with att + resnet152 --- ...stm_resnet152_att_is_cat_ffn_c123_loss.yml | 391 ++++++++++++++++++ 1 file changed, 391 insertions(+) create mode 100644 configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml new file mode 100644 index 0000000..cad4b5b --- /dev/null +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml @@ -0,0 +1,391 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +hyperparameters: + # In here I am putting some of the hyperparameters from spreadsheet. 
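The attention.py change above computes the component's output size as feature_maps_depth * num_attention_heads + question_encoding_size and now publishes it to globals, so downstream components read it instead of relying on a hard-coded value. A quick check of the numbers implied by these configs (the feature-map depths are the standard torchvision ones for the wrapped backbones, stated here as assumptions):

    # VGG16 feature maps are 512-deep; with 2 attention heads and a 100-dim question encoding:
    assert 512 * 2 + 100 == 1124      # the 1124 used in the concat component's input_dims
    assert 1124 + 10 == 1134          # plus the 10-dim image-size encoding -> fusion size 1134

    # resnet152 feature maps are 2048-deep, so the same component would export:
    assert 2048 * 2 + 100 == 4196     # picked up downstream via the new globals export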
+ + question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize + # Accepted formats: a,b,c or [a,b,c] + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + + image_preprocessing: &image_preprocessing normalize + # Accepted formats: a,b,c or [a,b,c] + # none | random_affine | random_horizontal_flip | normalize | all + + # Image encoder. + image_encoder_model: &image_encoder_model resnet152 + # Options: vgg16 | densenet121 | resnet152 | resnet50 + #image_encoder_output_size_val: &image_encoder_output_size_val 100 + # INFO: this variable is not important, as we are using features in this pipeline!! + + # Question encoder. + question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt + # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled + question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 50 + question_encoder_lstm_size_val: &question_encoder_lstm_size_val 50 + question_encoder_output_size_val: &question_encoder_output_size_val 100 + + # Fusion I: image + question + question_image_fusion_type_val: &question_image_fusion_type VQA_Attention + # Options: ElementWiseMultiplication | VQA_Attention + #question_image_fusion_size_val: &question_image_fusion_size_val 1124 + # INFO: this variable is set by VQA_Attention component! + + # Image size encoder. + image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 + + # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 1134 + + # Final classifier: FFN. + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [500] + + batch_size: &batch_size 256 + preload_images: &preload_images True + num_workers: &num_workers 0 + +# Training parameters: +training: + problem: + batch_size: *batch_size + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + # Preload images. + preload_images: *preload_images + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Use four workers for loading images. + dataloader: + num_workers: *num_workers + + # Optimizer parameters: + optimizer: + name: Adam + lr: 0.0001 + + # Terminal conditions: + terminal_conditions: + loss_stop: 1.0e-3 + episode_limit: 10000 + epoch_limit: -1 + +# Validation parameters: +validation: + partial_validation_interval: 100 + problem: + batch_size: *batch_size + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: *question_preprocessing + image_preprocessing: *image_preprocessing + # Preload images: false, as we will need them only once, at the end. + preload_images: false + streams: + questions: tokenized_questions + dataloader: + num_workers: 1 + + +pipeline: + # Disable flow 0. + disable: + pipe0_question_embeddings,pipe0_question_embeddings,pipe0_lstm,pipe0_classifier, + pipe0_category_decoder,pipe0_category_accuracy + + ################# PIPE 0: SHARED ################# + + # Add global variables. 
+ global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size] #, image_encoder_output_size] #, fused_activation_size] + values: [*question_encoder_output_size_val, 2, *image_size_encoder_output_size_val] #, *image_encoder_output_size_val] #, *question_image_fusion_size_val] + + # Statistics. + batch_size: + priority: 0.1 + type: BatchSizeStatistics + + # Answer encoding. + pipe1_all_answer_indexer: + priority: 0.2 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: answers_ids + globals: + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn + + + ################# PIPE 0: QUESTION CATEGORIZATION ################# + + # Add global variables - the ones related to only question categorization. + pipe0_global_publisher: + priority: 0.3 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [pipe0_question_encoder_output_size] + values: [100] + + # Model 1: question embeddings + pipe0_question_embeddings: + priority: 0.4 + type: SentenceEmbeddings + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + globals: + embeddings_size: pipe0_embeddings_size + + # Model 2: question RNN + pipe0_lstm: + priority: 0.5 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_question_activations + globals: + input_size: pipe0_embeddings_size + prediction_size: pipe0_question_encoder_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.6 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.7 + streams: + inputs: pipe0_question_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: pipe0_question_encoder_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.8 + type: WordDecoder + # Use the same word mappings as label indexer. 
+ import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + priority: 0.9 + type: AccuracyStatistics + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: *question_encoder_embeddings_size_val + pretrained_embeddings_file: *question_encoder_embeddings + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + globals: + embeddings_size: pipe1_embeddings_size + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + hidden_size: *question_encoder_lstm_size_val + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: pipe1_embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model: *image_encoder_model + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + priority: 3.1 + type: FeedForwardNetwork + use_losfotmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: image-question fusion ################# + # Attention + FF. + question_image_fusion: + priority: 4.1 + type: *question_image_fusion_type + dropout_rate: 0.5 + # Attention params. 
+ latent_size: 100 + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_activations + outputs: fused_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: fused_activation_size + + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + #hidden_sizes: [*question_image_fusion_size_val] + dropout_rate: 0.5 + use_logsoftmax: False + streams: + inputs: fused_activations + predictions: question_image_activations + globals: + input_size: fused_activation_size + prediction_size: fused_activation_size + + ################# PIPE 5: image-question-image size fusion ################# + + # 5th subpipeline: concatenation + concat: + priority: 5.1 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,1124],[-1,*image_size_encoder_output_size_val]] + output_dims: [-1,*question_image_size_fusion_size_val] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + ################# PIPE 6: C1 + C2 + C3 questions ################# + + # Model 4: FFN C123 answering + pipe6_c123_answer_classifier: + priority: 6.3 + type: FeedForwardNetwork + hidden: *answer_classifier_hidden_sizes_val + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe6_c123_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_binary_yn + + pipe6_c123_nllloss: + priority: 6.4 + type: NLLLoss + targets_dim: 1 + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + loss: pipe6_c123_loss + + pipe6_c123_precision_recall: + priority: 6.5 + type: PrecisionRecallStatistics + use_word_mappings: True + show_class_scores: True + #show_confusion_matrix: True + streams: + predictions: pipe6_c123_predictions + targets: answers_ids + globals: + word_mappings: word_mappings_c123_binary_yn + statistics: + precision: pipe6_c123_precision + recall: pipe6_c123_recall + f1score: pipe6_c123_f1score + + # C123 Predictions decoder. + pipe5_c123_prediction_decoder: + priority: 6.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe6_c123_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_binary_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + # Viewers. + viewer: + priority: 9.3 + type: StreamViewer + input_streams: + tokenized_questions, + category_names, pipe0_predicted_question_categories_names, + answers, predicted_answers + + +#: pipeline From 2aef2990de0f14e162340a40a2edfdb261336389 Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sun, 5 May 2019 11:30:34 -0700 Subject: [PATCH 10/14] Fix in MCB enabling loading/storing projection matrices, flag for making them trainable --- .../multimodal_compact_bilinear_pooling.yml | 4 +++ .../c2_classification_all_rnn_vgg16_mcb.yml | 2 +- .../multimodal_compact_bilinear_pooling.py | 28 +++++++++++-------- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml b/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml index 3d8a98a..9c5d8f4 100644 --- a/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml +++ b/configs/default/components/models/vqa/multimodal_compact_bilinear_pooling.yml @@ -4,6 +4,10 @@ # 1. 
CONFIGURATION PARAMETERS that will be LOADED by the component. #################################################################### +# Parameter denoting whether projection matrices are trainable (LOADED) +# Setting flag that to true will result in trainable, dense (i.e. not "sketch") projection layers. +trainable_projections: False + streams: #################################################################### # 2. Keymappings associated with INPUT and OUTPUT streams. diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml index d28a24f..075c958 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml @@ -8,7 +8,7 @@ pipeline: type: GlobalVariablePublisher # Add input_size to globals. keys: [question_encoder_output_size, image_encoder_output_size, fused_image_question_activation_size] - values: [200, 1000, 100] + values: [200, 500, 100] ################# PIPE 0: question ################# # Questions encoding. diff --git a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py index 4e2c6be..0a3953c 100644 --- a/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py +++ b/ptp/components/models/vqa/multimodal_compact_bilinear_pooling.py @@ -61,8 +61,13 @@ def __init__(self, name, config): self.output_size = self.globals["output_size"] # Initialize sketch projection matrices. - self.image_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.image_encoding_size, self.output_size) - self.question_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.question_encoding_size, self.output_size) + image_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.image_encoding_size, self.output_size) + question_sketch_projection_matrix = self.generate_count_sketch_projection_matrix(self.question_encoding_size, self.output_size) + + # Make them parameters of the model, so can be stored/loaded and trained (optionally). + trainable_projections = self.config["trainable_projections"] + self.image_sketch_projection_matrix = torch.nn.Parameter(image_sketch_projection_matrix, requires_grad=trainable_projections) + self.question_sketch_projection_matrix = torch.nn.Parameter(question_sketch_projection_matrix, requires_grad=trainable_projections) def generate_count_sketch_projection_matrix(self, input_size, output_size): @@ -77,21 +82,22 @@ def generate_count_sketch_projection_matrix(self, input_size, output_size): # Generate s: 1 or -1 s = 2 * np.random.randint(2, size=input_size) - 1 s = torch.from_numpy(s) - #print("s=",s) + #print("s=",s.shape) # Generate h (indices) h = np.random.randint(output_size, size=input_size) - #print("h=",h) + #print("h=",h.shape) indices = np.concatenate((np.arange(input_size)[..., np.newaxis],h[..., np.newaxis]), axis=1) indices = torch.from_numpy(indices) - #print("indices=",indices) + #print("indices=",indices.shape) # Generate sparse matrix. sparse_sketch_matrix = torch.sparse.FloatTensor(indices.t(), s, torch.Size([input_size, output_size])) - #print("\n sparse_sketch_matrix=",sparse_sketch_matrix) + #print("\n sparse_sketch_matrix=",sparse_sketch_matrix.shape) # Return dense matrix. 
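The multimodal_compact_bilinear_pooling.py hunk above wraps the two count-sketch projection matrices in torch.nn.Parameter, so they are saved and restored with the model and, when trainable_projections is True, updated by the optimizer like any other weight; otherwise they stay fixed random sketches. Each matrix has exactly one non-zero entry of value +1 or -1 per input row: input feature i lands in output bin h[i] with sign s[i]. A self-contained sketch of the same construction, with assumed sizes:

    import numpy as np
    import torch

    def count_sketch_matrix(input_size, output_size):
        # One entry per row: feature i goes to bin h[i] with random sign s[i].
        s = 2 * np.random.randint(2, size=input_size) - 1
        h = np.random.randint(output_size, size=input_size)
        matrix = torch.zeros(input_size, output_size)
        matrix[torch.arange(input_size), torch.from_numpy(h)] = torch.from_numpy(s).float()
        return matrix

    # As a Parameter it lands in state_dict(); requires_grad decides trainability.
    projection = torch.nn.Parameter(count_sketch_matrix(512, 1024), requires_grad=False)
    sketch = torch.randn(8, 512).mm(projection)   # count sketch of a batch of 8 encodings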
dense_ssm = sparse_sketch_matrix.to_dense().type(self.app_state.FloatTensor) #print("\n dense_ssm=",dense_ssm) + return dense_ssm @@ -125,16 +131,16 @@ def forward(self, data_dict): :param data_dict: DataDict({'images',**}) :type data_dict: ``ptp.dadatypes.DataDict`` """ - # Unpack DataDict. enc_img = data_dict[self.key_image_encodings] enc_q = data_dict[self.key_question_encodings] - #print("\n enc_img=",enc_img) - #print("\n image_sketch_projection_matrix=",self.image_sketch_projection_matrix) + + sketch_pm_img = self.image_sketch_projection_matrix + sketch_pm_q = self.question_sketch_projection_matrix # Project both batches. - sketch_img = enc_img.mm(self.image_sketch_projection_matrix) - sketch_q = enc_q.mm(self.question_sketch_projection_matrix) + sketch_img = enc_img.mm(sketch_pm_img) + sketch_q = enc_q.mm(sketch_pm_q) # Add imaginary parts (with zeros). sketch_img_reim = torch.stack([sketch_img, torch.zeros(sketch_img.shape).type(self.app_state.FloatTensor)], dim=2) From 36839bc1686fe1af5edae8dec0b9965d089e3864 Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sun, 5 May 2019 12:16:32 -0700 Subject: [PATCH 11/14] Fixed verions of pytorch/torchvision/torchtext --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index c3dba09..72bfa6e 100644 --- a/setup.py +++ b/setup.py @@ -166,9 +166,9 @@ 'nltk', 'pandas', 'pillow', - #'torchtext', - 'torchvision', - 'torch', + 'torchtext==0.3.1', + 'torchvision==0.1.9', + 'torch==1.0.1', 'PyYAML', 'requests' ], From 96200f83e7d387cdeb1616cc55f1e4660952a38e Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sun, 5 May 2019 12:29:34 -0700 Subject: [PATCH 12/14] torchvision version==0.2.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 72bfa6e..b7ba573 100644 --- a/setup.py +++ b/setup.py @@ -167,7 +167,7 @@ 'pandas', 'pillow', 'torchtext==0.3.1', - 'torchvision==0.1.9', + 'torchvision==0.2.1', 'torch==1.0.1', 'PyYAML', 'requests' From 5482a3e49faffb14b6267c4cb3affd722bdfb21d Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sun, 5 May 2019 12:52:17 -0700 Subject: [PATCH 13/14] renamted exemplary config, added new one for glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss --- ...le_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml} | 3 ++- ...glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml | 2 +- ...love_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml} | 12 ++++++------ .../glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml | 2 +- .../glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml | 2 +- .../glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml | 12 ++++++------ 6 files changed, 17 insertions(+), 16 deletions(-) rename configs/vqa_med_2019/evaluation/{mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml => example_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml} (99%) rename configs/vqa_med_2019/evaluation/{glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml => tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml} (97%) diff --git a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/example_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml similarity index 99% rename from configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml rename to configs/vqa_med_2019/evaluation/example_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml index 516df63..2f498db 100644 --- a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml +++ 
b/configs/vqa_med_2019/evaluation/example_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml @@ -38,9 +38,10 @@ hyperparameters: # Final classifier: FFN. answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [83] + # Parameters related to GPU/CPU distribution. batch_size: &batch_size 200 preload_images: &preload_images True - num_workers: &num_workers 0 + num_workers: &num_workers 1 # Training parameters: training: diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml index cad4b5b..3157f00 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml @@ -42,7 +42,7 @@ hyperparameters: batch_size: &batch_size 256 preload_images: &preload_images True - num_workers: &num_workers 0 + num_workers: &num_workers 1 # Training parameters: training: diff --git a/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml similarity index 97% rename from configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml rename to configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml index 65eed52..f89a53e 100644 --- a/configs/vqa_med_2019/evaluation/glove_lstm_resnet50_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml @@ -13,7 +13,7 @@ hyperparameters: # none | random_affine | random_horizontal_flip | normalize | all # Image encoder. - image_encoder_model: &image_encoder_model resnet50 + image_encoder_model: &image_encoder_model resnet152 # Options: vgg16 | densenet121 | resnet152 | resnet50 image_encoder_output_size_val: &image_encoder_output_size_val 100 @@ -25,8 +25,8 @@ hyperparameters: question_encoder_output_size_val: &question_encoder_output_size_val 100 # Fusion I: image + question - question_image_fusion_type_val: &question_image_fusion_type ElementWiseMultiplication - # Options: ElementWiseMultiplication | ? (component: question_image_fusion) + question_image_fusion_type_val: &question_image_fusion_type MultimodalCompactBilinearPooling + # Options: ElementWiseMultiplication | MultimodalCompactBilinearPooling | VQA_Attention question_image_fusion_size_val: &question_image_fusion_size_val 100 # Image size encoder. @@ -38,9 +38,9 @@ hyperparameters: # Final classifier: FFN. 
answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] - batch_size: &batch_size 64 - preload_images: &preload_images True - num_workers: &num_workers 0 + batch_size: &batch_size 150 + preload_images: &preload_images False + num_workers: &num_workers 3 # Training parameters: training: diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml index fe63930..5dde544 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml @@ -42,7 +42,7 @@ hyperparameters: batch_size: &batch_size 100 preload_images: &preload_images True - num_workers: &num_workers 0 + num_workers: &num_workers 1 # Training parameters: training: diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml index 01d13e4..6e1fa71 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml @@ -40,7 +40,7 @@ hyperparameters: batch_size: &batch_size 100 preload_images: &preload_images True - num_workers: &num_workers 0 + num_workers: &num_workers 1 # Training parameters: training: diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml index f7d4d89..4f52097 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml @@ -15,7 +15,7 @@ hyperparameters: # Image encoder. image_encoder_model: &image_encoder_model vgg16 # Options: vgg16 | densenet121 | resnet152 | resnet50 - image_encoder_output_size_val: &image_encoder_output_size_val 100 + image_encoder_output_size_val: &image_encoder_output_size_val 1000 # Question encoder. question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt @@ -27,20 +27,20 @@ hyperparameters: # Fusion I: image + question question_image_fusion_type_val: &question_image_fusion_type MultimodalCompactBilinearPooling # Options: ElementWiseMultiplication | MultimodalCompactBilinearPooling | - question_image_fusion_size_val: &question_image_fusion_size_val 100 + question_image_fusion_size_val: &question_image_fusion_size_val 200 # Image size encoder. image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10 # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val) - question_image_size_fusion_size_val: &question_image_size_fusion_size_val 110 + question_image_size_fusion_size_val: &question_image_size_fusion_size_val 210 # Final classifier: FFN. 
- answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100] + answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [500] - batch_size: &batch_size 100 + batch_size: &batch_size 200 preload_images: &preload_images True - num_workers: &num_workers 0 + num_workers: &num_workers 1 # Training parameters: training: From 17ed8d91cdd7d6deaf08fe6b27a230a9b9d253ca Mon Sep 17 00:00:00 2001 From: Tomasz Kornuta Date: Sun, 5 May 2019 13:45:47 -0700 Subject: [PATCH 14/14] default q categorization: streaming images: False, batch: 256 --- .../default_question_categorization.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/configs/vqa_med_2019/question_categorization/default_question_categorization.yml b/configs/vqa_med_2019/question_categorization/default_question_categorization.yml index 79c56ea..4a0f255 100644 --- a/configs/vqa_med_2019/question_categorization/default_question_categorization.yml +++ b/configs/vqa_med_2019/question_categorization/default_question_categorization.yml @@ -5,6 +5,9 @@ training: problem: categories: all export_sample_weights: ~/data/vqa-med/answers.all.weights.csv + # Do not load and stream images! + stream_images: False + batch_size: 256 sampler: weights: ~/data/vqa-med/answers.all.weights.csv terminal_conditions: @@ -13,6 +16,9 @@ training: validation: problem: categories: all + # Do not load and stream images! + stream_images: False + batch_size: 256 pipeline:
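The final patch makes the question-categorization defaults skip image loading entirely (stream_images: False) and raises the batch size to 256; since that pipeline only consumes tokenized questions, dropping image I/O is what makes the larger batches cheap. A config deriving from these defaults could override them again if its model does need the images, for example (hypothetical override, key names taken from the defaults above):

    training:
      problem:
        stream_images: True    # load images again for models that consume them
        batch_size: 64         # and use smaller batches once image tensors are in play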