configs/default/components/models/vqa/relational_network.yml (55 additions, 0 deletions)
@@ -0,0 +1,55 @@
# This file defines the default values for the RelationalNetwork model.

####################################################################
# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
####################################################################

# Dropout rate (LOADED)
# Default: 0 (i.e. dropout is turned off)
dropout_rate: 0

# Size of the output of the g_theta network, i.e. the output size after concatenation (LOADED)
output_size: 256

streams:
  ####################################################################
  # 2. Keymappings associated with INPUT and OUTPUT streams.
  ####################################################################

  # Stream containing batch of encoded images (INPUT)
  feature_maps: feature_maps

  # Stream containing batch of encoded questions (INPUT)
  question_encodings: question_encodings

  # Stream containing outputs (OUTPUT)
  outputs: outputs

globals:
  ####################################################################
  # 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
  ####################################################################

  # Height of the features tensor (RETRIEVED)
  feature_maps_height: feature_maps_height

  # Width of the features tensor (RETRIEVED)
  feature_maps_width: feature_maps_width

  # Depth of the features tensor (RETRIEVED)
  feature_maps_depth: feature_maps_depth

  # Size of the question encodings input (RETRIEVED)
  question_encoding_size: question_encoding_size

  ####################################################################
  # 4. Keymappings associated with GLOBAL variables that will be SET.
  ####################################################################

  # Size of the output (SET)
  output_size: output_size

####################################################################
# 5. Keymappings associated with statistics that will be ADDED.
####################################################################
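For context: a minimal sketch of the fusion a RelationalNetwork component of this kind presumably performs, assuming it follows the Relation Network of Santoro et al. (2017) — each pair of feature-map "objects" is concatenated with the question encoding, passed through g_theta, and the results are summed. The class name and layer sizes below are illustrative, not this component's actual code.

```python
import torch
import torch.nn as nn

class RelationalFusion(nn.Module):
    """Illustrative Relation Network fusion (assumed behavior)."""
    def __init__(self, depth, q_size, output_size=256, dropout_rate=0.0):
        super().__init__()
        # g_theta consumes a pair of feature-map "objects" plus the question.
        self.g_theta = nn.Sequential(
            nn.Linear(2 * depth + q_size, output_size), nn.ReLU(),
            nn.Linear(output_size, output_size), nn.ReLU(),
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, feature_maps, question_encodings):
        # feature_maps: [N, depth, height, width]; question: [N, q_size]
        n, d, h, w = feature_maps.shape
        objs = feature_maps.flatten(2).transpose(1, 2)   # [N, h*w, d]
        k = objs.size(1)
        # Build all k*k object pairs and append the question to each pair.
        a = objs.unsqueeze(2).expand(n, k, k, d)
        b = objs.unsqueeze(1).expand(n, k, k, d)
        q = question_encodings.unsqueeze(1).unsqueeze(1).expand(n, k, k, -1)
        pairs = torch.cat([a, b, q], dim=3)              # [N, k, k, 2d+q]
        out = self.g_theta(pairs).sum(dim=(1, 2))        # [N, output_size]
        return self.dropout(out)
```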

configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml
@@ -1,6 +1,14 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml

+# Training parameters:
+training:
+  problem:
+    batch_size: 64
+validation:
+  problem:
+    batch_size: 64

pipeline:
  name: c2_classification_all_rnn_vgg16_ewm_size

@@ -24,8 +32,8 @@ pipeline:
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
-   embeddings_size: 50
-   pretrained_embeddings_file: glove.6B.50d.txt
+   embeddings_size: 100
+   pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
@@ -39,8 +47,9 @@ pipeline:
    cell_type: LSTM
    prediction_mode: Last
    use_logsoftmax: False
-   initial_state_trainable: False
+   initial_state_trainable: True
    hidden_size: 50
+   #dropout_rate: 0.5
    streams:
      inputs: embedded_questions
      predictions: question_activations
@@ -117,7 +126,7 @@ pipeline:
  classifier:
    priority: 5.3
    type: FeedForwardNetwork
-   hidden_sizes: [110]
+   hidden_sizes: [100]
    dropout_rate: 0.5
    streams:
      inputs: concatenated_activations
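One change in this file flips initial_state_trainable from False to True, i.e. the LSTM's initial hidden and cell states become learned parameters rather than fixed zeros. A minimal sketch of the idea (illustrative, not pytorchpipe's actual implementation):

```python
import torch
import torch.nn as nn

class TrainableInitLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, trainable_init=True):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        init = torch.zeros(1, 1, hidden_size)
        if trainable_init:
            # Learned initial states: updated by backprop like any weight.
            self.h0 = nn.Parameter(init.clone())
            self.c0 = nn.Parameter(init.clone())
        else:
            # Fixed zeros: registered as non-trainable buffers.
            self.register_buffer("h0", init.clone())
            self.register_buffer("c0", init.clone())

    def forward(self, x):                      # x: [N, T, input_size]
        n = x.size(0)
        h0 = self.h0.expand(1, n, -1).contiguous()
        c0 = self.c0.expand(1, n, -1).contiguous()
        out, _ = self.lstm(x, (h0, c0))
        return out[:, -1]                      # "Last" prediction mode
```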
@@ -72,7 +72,6 @@ pipeline:
  question_image_fusion:
    priority: 4.1
    type: MultimodalCompactBilinearPooling
-   dropout_rate: 0.5
    streams:
      image_encodings: image_activations
      question_encodings: question_activations
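The removed dropout_rate belonged to the MultimodalCompactBilinearPooling fusion. For reference, a minimal sketch of MCB pooling as usually implemented (Fukui et al., 2016): count sketches of both encodings are convolved via FFT. Sizes, seeds, and names here are illustrative assumptions, not the component's actual code.

```python
import torch

def count_sketch(x, h, s, d):
    # x: [N, c] -> sketch: [N, d], using a fixed random hash h and signs s.
    out = x.new_zeros(x.size(0), d)
    out.index_add_(1, h, x * s)
    return out

def mcb(img, qst, d=1024, seed=0):
    g = torch.Generator().manual_seed(seed)
    hi = torch.randint(0, d, (img.size(1),), generator=g)
    si = torch.randint(0, 2, (img.size(1),), generator=g).float() * 2 - 1
    hq = torch.randint(0, d, (qst.size(1),), generator=g)
    sq = torch.randint(0, 2, (qst.size(1),), generator=g).float() * 2 - 1
    # Convolution of the sketches = element-wise product in frequency domain.
    fi = torch.fft.rfft(count_sketch(img, hi, si, d))
    fq = torch.fft.rfft(count_sketch(qst, hq, sq, d))
    return torch.fft.irfft(fi * fq, n=d)       # [N, d]

out = mcb(torch.randn(64, 100), torch.randn(64, 100))
print(out.shape)                               # torch.Size([64, 1024])
```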
configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_relational_net.yml (97 additions, 0 deletions)
@@ -0,0 +1,97 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml

# Training parameters:
training:
  problem:
    batch_size: 64
validation:
  problem:
    batch_size: 64

pipeline:
  name: c2_classification_all_rnn_vgg16_relational_net

  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    # Add input_size to globals.
    keys: [question_encoder_output_size]
    values: [100]

  ################# PIPE 0: question #################
  # Questions encoding.
  question_tokenizer:
    priority: 1.1
    type: SentenceTokenizer
    streams:
      inputs: questions
      outputs: tokenized_questions

  # Model 1: Embeddings
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
    embeddings_size: 100
    pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions

  # Model 2: RNN
  question_lstm:
    priority: 1.3
    type: RecurrentNeuralNetwork
    cell_type: LSTM
    prediction_mode: Last
    use_logsoftmax: False
    initial_state_trainable: True
    #dropout_rate: 0.5
    hidden_size: 50
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: embeddings_size
      prediction_size: question_encoder_output_size

  ################# PIPE 2: image #################
  # Image encoder.
  image_encoder:
    priority: 3.1
    type: TorchVisionWrapper
    return_feature_maps: True
    freeze: True
    streams:
      inputs: images
      outputs: feature_maps

  ################# PIPE 3: fusion + classification #################
  # Relational network fusion + FF.
  question_image_fusion:
    priority: 4.1
    type: RelationalNetwork
    dropout_rate: 0.5
    output_size: 100
    streams:
      question_encodings: question_activations
      outputs: fused_image_question_activations
    globals:
      question_encoding_size: question_encoder_output_size
      output_size: fused_image_question_activation_size

  classifier:
    priority: 4.2
    type: FeedForwardNetwork
    hidden_sizes: [100, 100]
    dropout_rate: 0.5
    streams:
      inputs: fused_image_question_activations
    globals:
      input_size: fused_image_question_activation_size
      prediction_size: vocabulary_size_c2
#: pipeline
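A rough shape walk-through of this pipeline, reusing the RelationalFusion sketch from the first file's example. The [N, 512, 7, 7] VGG-16 feature-map shape is an assumption for illustration; at runtime the encoder publishes feature_maps_height/width/depth into globals instead.

```python
import torch

N = 64                                          # batch_size
question_activations = torch.randn(N, 100)      # question_encoder_output_size
feature_maps = torch.randn(N, 512, 7, 7)        # assumed VGG-16 conv output

# 7*7 = 49 objects, 49*49 pairs, each of size 2*512 + 100 = 1124 into g_theta.
fusion = RelationalFusion(depth=512, q_size=100,
                          output_size=100, dropout_rate=0.5)
fused = fusion(feature_maps, question_activations)
print(fused.shape)                              # torch.Size([64, 100])
# The classifier then maps 100 -> [100, 100] -> vocabulary_size_c2 logits.
```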
configs/vqa_med_2019/c4_classification/c4_classification_all_rnn_vgg16_ewm_size.yml (130 additions, 0 deletions)
@@ -0,0 +1,130 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml

pipeline:
  name: c4_classification_all_rnn_vgg16_ewm_size

  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    # Add input_size to globals.
    keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size, image_size_encoder_input_size, image_size_encoder_output_size]
    values: [100, 100, 100, 2, 10]

  ################# PIPE 0: question #################
  # Questions encoding.
  question_tokenizer:
    priority: 1.1
    type: SentenceTokenizer
    streams:
      inputs: questions
      outputs: tokenized_questions

  # Model 1: Embeddings
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
    embeddings_size: 100
    pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions

  # Model 2: RNN
  question_lstm:
    priority: 1.3
    type: RecurrentNeuralNetwork
    cell_type: LSTM
    prediction_mode: Last
    use_logsoftmax: False
    initial_state_trainable: True
    hidden_size: 50
    #dropout_rate: 0.5
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: embeddings_size
      prediction_size: question_encoder_output_size

  ################# PIPE 2: image #################
  # Image encoder.
  image_encoder:
    priority: 3.1
    type: TorchVisionWrapper
    streams:
      inputs: images
      outputs: image_activations
    globals:
      output_size: image_encoder_output_size

  ################# PIPE 3: image-question fusion #################
  # Element wise multiplication + FF.
  question_image_fusion:
    priority: 4.1
    type: ElementWiseMultiplication
    dropout_rate: 0.5
    streams:
      image_encodings: image_activations
      question_encodings: question_activations
      outputs: element_wise_activations
    globals:
      image_encoding_size: image_encoder_output_size
      question_encoding_size: question_encoder_output_size
      output_size: element_wise_activation_size

  question_image_ffn:
    priority: 4.2
    type: FeedForwardNetwork
    hidden_sizes: [100]
    dropout_rate: 0.5
    streams:
      inputs: element_wise_activations
      predictions: question_image_activations
    globals:
      input_size: element_wise_activation_size
      prediction_size: element_wise_activation_size

  ################# PIPE 4: image-question-image size fusion + classification #################
  # 2nd subpipeline: image size.
  # Model - image size classifier.
  image_size_encoder:
    priority: 5.1
    type: FeedForwardNetwork
    streams:
      inputs: image_sizes
      predictions: image_size_activations
    globals:
      input_size: image_size_encoder_input_size
      prediction_size: image_size_encoder_output_size

  # 4th subpipeline: concatenation + FF.
  concat:
    priority: 5.2
    type: Concatenation
    input_streams: [question_image_activations, image_size_activations]
    # Concatenation along dim 1 (default): [-1, 100] + [-1, 10] -> [-1, 110].
    dim: 1 # default
    input_dims: [[-1, 100], [-1, 10]]
    output_dims: [-1, 110]
    streams:
      outputs: concatenated_activations
    globals:
      output_size: concatenated_activations_size


  classifier:
    priority: 5.3
    type: FeedForwardNetwork
    hidden_sizes: [500]
    dropout_rate: 0.5
    streams:
      inputs: concatenated_activations
    globals:
      input_size: concatenated_activations_size
      prediction_size: vocabulary_size_c4


#: pipeline
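A minimal sketch of this pipeline's fusion stages, assuming ElementWiseMultiplication projects both encodings to a common size and takes their Hadamard product (class name and layers are illustrative, not the component's actual code), followed by the concatenation whose dims the concat section declares:

```python
import torch
import torch.nn as nn

class ElementWiseFusion(nn.Module):
    """Illustrative element-wise (Hadamard) fusion."""
    def __init__(self, image_size=100, question_size=100,
                 output_size=100, dropout_rate=0.5):
        super().__init__()
        self.img_proj = nn.Linear(image_size, output_size)
        self.qst_proj = nn.Linear(question_size, output_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, image_enc, question_enc):
        # Hadamard product of the two projected encodings.
        return self.dropout(self.img_proj(image_enc) *
                            self.qst_proj(question_enc))

fusion = ElementWiseFusion()
img = torch.randn(64, 100)                 # image_activations
qst = torch.randn(64, 100)                 # question_activations
size_enc = torch.randn(64, 10)             # image_size_activations

fused = fusion(img, qst)                   # [64, 100]
concat = torch.cat([fused, size_enc], dim=1)
print(concat.shape)                        # torch.Size([64, 110]) -> classifier
```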