configs/default/components/models/vqa/relational_network.yml (55 additions, 0 deletions)
@@ -0,0 +1,55 @@
# This file defines the default values for the RelationalNetwork model.

####################################################################
# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
####################################################################

# Dropout rate (LOADED)
# Default: 0 (i.e. dropout is turned off)
dropout_rate: 0

# Size of the output of the g_theta network, i.e. the output size after concatenation (LOADED)
output_size: 256

streams:
  ####################################################################
  # 2. Keymappings associated with INPUT and OUTPUT streams.
  ####################################################################

  # Stream containing batch of encoded images (INPUT)
  feature_maps: feature_maps

  # Stream containing batch of encoded questions (INPUT)
  question_encodings: question_encodings

  # Stream containing outputs (OUTPUT)
  outputs: outputs

globals:
  ####################################################################
  # 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
  ####################################################################

  # Height of the features tensor (RETRIEVED)
  feature_maps_height: feature_maps_height

  # Width of the features tensor (RETRIEVED)
  feature_maps_width: feature_maps_width

  # Depth of the features tensor (RETRIEVED)
  feature_maps_depth: feature_maps_depth

  # Size of the question encodings input (RETRIEVED)
  question_encoding_size: question_encoding_size

  ####################################################################
  # 4. Keymappings associated with GLOBAL variables that will be SET.
  ####################################################################

  # Size of the output (SET)
  output_size: output_size

####################################################################
# 5. Keymappings associated with statistics that will be ADDED.
####################################################################
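For context: a minimal sketch of the fusion a RelationalNetwork component of this kind presumably performs, assuming it follows the Relation Network of Santoro et al. (2017) — each pair of feature-map "objects" is concatenated with the question encoding, passed through g_theta, and the results are summed. The class name and layer sizes below are illustrative, not this component's actual code.

```python
import torch
import torch.nn as nn

class RelationalFusion(nn.Module):
    """Illustrative Relation Network fusion (assumed behavior)."""
    def __init__(self, depth, q_size, output_size=256, dropout_rate=0.0):
        super().__init__()
        # g_theta consumes a pair of feature-map "objects" plus the question.
        self.g_theta = nn.Sequential(
            nn.Linear(2 * depth + q_size, output_size), nn.ReLU(),
            nn.Linear(output_size, output_size), nn.ReLU(),
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, feature_maps, question_encodings):
        # feature_maps: [N, depth, height, width]; question: [N, q_size]
        n, d, h, w = feature_maps.shape
        objs = feature_maps.flatten(2).transpose(1, 2)   # [N, h*w, d]
        k = objs.size(1)
        # Build all k*k object pairs and append the question to each pair.
        a = objs.unsqueeze(2).expand(n, k, k, d)
        b = objs.unsqueeze(1).expand(n, k, k, d)
        q = question_encodings.unsqueeze(1).unsqueeze(1).expand(n, k, k, -1)
        pairs = torch.cat([a, b, q], dim=3)              # [N, k, k, 2d+q]
        out = self.g_theta(pairs).sum(dim=(1, 2))        # [N, output_size]
        return self.dropout(out)
```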

configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml
@@ -1,6 +1,14 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml

+# Training parameters:
+training:
+  problem:
+    batch_size: 64
+validation:
+  problem:
+    batch_size: 64

pipeline:
  name: c2_classification_all_rnn_vgg16_ewm_size

@@ -24,8 +32,8 @@ pipeline:
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
-   embeddings_size: 50
-   pretrained_embeddings_file: glove.6B.50d.txt
+   embeddings_size: 100
+   pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
@@ -39,8 +47,9 @@ pipeline:
    cell_type: LSTM
    prediction_mode: Last
    use_logsoftmax: False
-   initial_state_trainable: False
+   initial_state_trainable: True
    hidden_size: 50
+   #dropout_rate: 0.5
    streams:
      inputs: embedded_questions
      predictions: question_activations
@@ -117,7 +126,7 @@ pipeline:
  classifier:
    priority: 5.3
    type: FeedForwardNetwork
-   hidden_sizes: [110]
+   hidden_sizes: [100]
    dropout_rate: 0.5
    streams:
      inputs: concatenated_activations
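One change in this file flips initial_state_trainable from False to True, i.e. the LSTM's initial hidden and cell states become learned parameters rather than fixed zeros. A minimal sketch of the idea (illustrative, not pytorchpipe's actual implementation):

```python
import torch
import torch.nn as nn

class TrainableInitLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, trainable_init=True):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        init = torch.zeros(1, 1, hidden_size)
        if trainable_init:
            # Learned initial states: updated by backprop like any weight.
            self.h0 = nn.Parameter(init.clone())
            self.c0 = nn.Parameter(init.clone())
        else:
            # Fixed zeros: registered as non-trainable buffers.
            self.register_buffer("h0", init.clone())
            self.register_buffer("c0", init.clone())

    def forward(self, x):                      # x: [N, T, input_size]
        n = x.size(0)
        h0 = self.h0.expand(1, n, -1).contiguous()
        c0 = self.c0.expand(1, n, -1).contiguous()
        out, _ = self.lstm(x, (h0, c0))
        return out[:, -1]                      # "Last" prediction mode
```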
@@ -72,7 +72,6 @@ pipeline:
  question_image_fusion:
    priority: 4.1
    type: MultimodalCompactBilinearPooling
-   dropout_rate: 0.5
    streams:
      image_encodings: image_activations
      question_encodings: question_activations
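The removed dropout_rate belonged to the MultimodalCompactBilinearPooling fusion. For reference, a minimal sketch of MCB pooling as usually implemented (Fukui et al., 2016): count sketches of both encodings are convolved via FFT. Sizes, seeds, and names here are illustrative assumptions, not the component's actual code.

```python
import torch

def count_sketch(x, h, s, d):
    # x: [N, c] -> sketch: [N, d], using a fixed random hash h and signs s.
    out = x.new_zeros(x.size(0), d)
    out.index_add_(1, h, x * s)
    return out

def mcb(img, qst, d=1024, seed=0):
    g = torch.Generator().manual_seed(seed)
    hi = torch.randint(0, d, (img.size(1),), generator=g)
    si = torch.randint(0, 2, (img.size(1),), generator=g).float() * 2 - 1
    hq = torch.randint(0, d, (qst.size(1),), generator=g)
    sq = torch.randint(0, 2, (qst.size(1),), generator=g).float() * 2 - 1
    # Convolution of the sketches = element-wise product in frequency domain.
    fi = torch.fft.rfft(count_sketch(img, hi, si, d))
    fq = torch.fft.rfft(count_sketch(qst, hq, sq, d))
    return torch.fft.irfft(fi * fq, n=d)       # [N, d]

out = mcb(torch.randn(64, 100), torch.randn(64, 100))
print(out.shape)                               # torch.Size([64, 1024])
```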
configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_relational_net.yml (97 additions, 0 deletions)
@@ -0,0 +1,97 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml

# Training parameters:
training:
  problem:
    batch_size: 64
validation:
  problem:
    batch_size: 64

pipeline:
  name: c2_classification_all_rnn_vgg16_relational_net

  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    # Add input_size to globals.
    keys: [question_encoder_output_size]
    values: [100]

  ################# PIPE 0: question #################
  # Questions encoding.
  question_tokenizer:
    priority: 1.1
    type: SentenceTokenizer
    streams:
      inputs: questions
      outputs: tokenized_questions

  # Model 1: Embeddings
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
    embeddings_size: 100
    pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions

  # Model 2: RNN
  question_lstm:
    priority: 1.3
    type: RecurrentNeuralNetwork
    cell_type: LSTM
    prediction_mode: Last
    use_logsoftmax: False
    initial_state_trainable: True
    #dropout_rate: 0.5
    hidden_size: 50
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: embeddings_size
      prediction_size: question_encoder_output_size

  ################# PIPE 2: image #################
  # Image encoder.
  image_encoder:
    priority: 3.1
    type: TorchVisionWrapper
    return_feature_maps: True
    freeze: True
    streams:
      inputs: images
      outputs: feature_maps

  ################# PIPE 3: fusion + classification #################
  # Relational network fusion + FF.
  question_image_fusion:
    priority: 4.1
    type: RelationalNetwork
    dropout_rate: 0.5
    output_size: 100
    streams:
      question_encodings: question_activations
      outputs: fused_image_question_activations
    globals:
      question_encoding_size: question_encoder_output_size
      output_size: fused_image_question_activation_size

  classifier:
    priority: 4.2
    type: FeedForwardNetwork
    hidden_sizes: [100, 100]
    dropout_rate: 0.5
    streams:
      inputs: fused_image_question_activations
    globals:
      input_size: fused_image_question_activation_size
      prediction_size: vocabulary_size_c2
#: pipeline
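A rough shape walk-through of this pipeline, reusing the RelationalFusion sketch from the first file's example. The [N, 512, 7, 7] VGG-16 feature-map shape is an assumption for illustration; at runtime the encoder publishes feature_maps_height/width/depth into globals instead.

```python
import torch

N = 64                                          # batch_size
question_activations = torch.randn(N, 100)      # question_encoder_output_size
feature_maps = torch.randn(N, 512, 7, 7)        # assumed VGG-16 conv output

# 7*7 = 49 objects, 49*49 pairs, each of size 2*512 + 100 = 1124 into g_theta.
fusion = RelationalFusion(depth=512, q_size=100,
                          output_size=100, dropout_rate=0.5)
fused = fusion(feature_maps, question_activations)
print(fused.shape)                              # torch.Size([64, 100])
# The classifier then maps 100 -> [100, 100] -> vocabulary_size_c2 logits.
```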
configs/vqa_med_2019/c4_classification/c4_classification_all_rnn_vgg16_ewm_size.yml (130 additions, 0 deletions)
@@ -0,0 +1,130 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml

pipeline:
  name: c4_classification_all_rnn_vgg16_ewm_size

  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    # Add input_size to globals.
    keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size, image_size_encoder_input_size, image_size_encoder_output_size]
    values: [100, 100, 100, 2, 10]

  ################# PIPE 0: question #################
  # Questions encoding.
  question_tokenizer:
    priority: 1.1
    type: SentenceTokenizer
    streams:
      inputs: questions
      outputs: tokenized_questions

  # Model 1: Embeddings
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
    embeddings_size: 100
    pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions

  # Model 2: RNN
  question_lstm:
    priority: 1.3
    type: RecurrentNeuralNetwork
    cell_type: LSTM
    prediction_mode: Last
    use_logsoftmax: False
    initial_state_trainable: True
    hidden_size: 50
    #dropout_rate: 0.5
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: embeddings_size
      prediction_size: question_encoder_output_size

  ################# PIPE 2: image #################
  # Image encoder.
  image_encoder:
    priority: 3.1
    type: TorchVisionWrapper
    streams:
      inputs: images
      outputs: image_activations
    globals:
      output_size: image_encoder_output_size

  ################# PIPE 3: image-question fusion #################
  # Element wise multiplication + FF.
  question_image_fusion:
    priority: 4.1
    type: ElementWiseMultiplication
    dropout_rate: 0.5
    streams:
      image_encodings: image_activations
      question_encodings: question_activations
      outputs: element_wise_activations
    globals:
      image_encoding_size: image_encoder_output_size
      question_encoding_size: question_encoder_output_size
      output_size: element_wise_activation_size

  question_image_ffn:
    priority: 4.2
    type: FeedForwardNetwork
    hidden_sizes: [100]
    dropout_rate: 0.5
    streams:
      inputs: element_wise_activations
      predictions: question_image_activations
    globals:
      input_size: element_wise_activation_size
      prediction_size: element_wise_activation_size

  ################# PIPE 4: image-question-image size fusion + classification #################
  # 2nd subpipeline: image size.
  # Model - image size classifier.
  image_size_encoder:
    priority: 5.1
    type: FeedForwardNetwork
    streams:
      inputs: image_sizes
      predictions: image_size_activations
    globals:
      input_size: image_size_encoder_input_size
      prediction_size: image_size_encoder_output_size

  # 4th subpipeline: concatenation + FF.
  concat:
    priority: 5.2
    type: Concatenation
    input_streams: [question_image_activations, image_size_activations]
    # Concatenation along dim 1 (default): [-1, 100] + [-1, 10] -> [-1, 110].
    dim: 1 # default
    input_dims: [[-1, 100], [-1, 10]]
    output_dims: [-1, 110]
    streams:
      outputs: concatenated_activations
    globals:
      output_size: concatenated_activations_size


  classifier:
    priority: 5.3
    type: FeedForwardNetwork
    hidden_sizes: [500]
    dropout_rate: 0.5
    streams:
      inputs: concatenated_activations
    globals:
      input_size: concatenated_activations_size
      prediction_size: vocabulary_size_c4


#: pipeline
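A minimal sketch of this pipeline's fusion stages, assuming ElementWiseMultiplication projects both encodings to a common size and takes their Hadamard product (class name and layers are illustrative, not the component's actual code), followed by the concatenation whose dims the concat section declares:

```python
import torch
import torch.nn as nn

class ElementWiseFusion(nn.Module):
    """Illustrative element-wise (Hadamard) fusion."""
    def __init__(self, image_size=100, question_size=100,
                 output_size=100, dropout_rate=0.5):
        super().__init__()
        self.img_proj = nn.Linear(image_size, output_size)
        self.qst_proj = nn.Linear(question_size, output_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, image_enc, question_enc):
        # Hadamard product of the two projected encodings.
        return self.dropout(self.img_proj(image_enc) *
                            self.qst_proj(question_enc))

fusion = ElementWiseFusion()
img = torch.randn(64, 100)                 # image_activations
qst = torch.randn(64, 100)                 # question_activations
size_enc = torch.randn(64, 10)             # image_size_activations

fused = fusion(img, qst)                   # [64, 100]
concat = torch.cat([fused, size_enc], dim=1)
print(concat.shape)                        # torch.Size([64, 110]) -> classifier
```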