configs/default/components/models/vqa/attention.yml (7 changes: 6 additions & 1 deletion)

@@ -14,6 +14,11 @@ latent_size: 100
# Number of attention heads (LOADED)
num_attention_heads: 2

# Type of output returned
# Options: Image | Fusion
# Details: attention-weighted image | concatenation of attention-weighted image and RNN-encoded question
output_mode: Fusion


streams:
  ####################################################################
@@ -42,7 +47,7 @@ globals:

  # Depth of the features tensor (RETRIEVED)
  feature_maps_depth: feature_maps_depth

  # Size of the question encodings input (RETRIEVED)
  question_encoding_size: question_encoding_size

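The two `output_mode` options added above are easiest to see in code. Here is a minimal sketch of how the returned stream differs between them; the function and tensor names are illustrative, not the component's actual implementation:

```python
import torch

def attention_output(weighted_image, question_encoding, output_mode="Fusion"):
    """Sketch of the component's two output modes (names hypothetical).

    weighted_image:    [batch, image_size]    attention-weighted image features
    question_encoding: [batch, question_size] RNN-encoded question
    """
    if output_mode == "Image":
        # 'Image': return only the attention-weighted image features.
        return weighted_image
    if output_mode == "Fusion":
        # 'Fusion': concatenate the weighted image with the question encoding.
        return torch.cat([weighted_image, question_encoding], dim=-1)
    raise ValueError(f"Unknown output_mode: {output_mode}")
```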
[Second changed file in the diff; filename not captured]

@@ -40,13 +40,13 @@ globals:
  # Size of the question encodings input (RETRIEVED)
  question_encoding_size: question_encoding_size

  # Size of the output (RETRIEVED)
  output_size: output_size

  ####################################################################
  # 4. Keymappings associated with GLOBAL variables that will be SET.
  ####################################################################

  # Size of the output (SET)
  output_size: output_size

  ####################################################################
  # 5. Keymappings associated with statistics that will be ADDED.
  ####################################################################
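This hunk appears to move `output_size` from the RETRIEVED globals (section 3) to the SET globals (section 4), i.e. the component now computes its own output size and publishes it instead of reading it. A rough sketch of the difference, assuming a simple dict-backed registry and that the output concatenates one attended vector per head (both assumptions; the actual mechanism lives in the framework):

```python
# Hypothetical dict-backed globals registry, for illustration only.
globals_registry = {"question_encoding_size": 200}

class SelfAttentionSketch:
    def __init__(self, num_attention_heads=4):
        # RETRIEVED: read from globals, published by an upstream component.
        self.encoding_size = globals_registry["question_encoding_size"]
        # SET: computed here and published for downstream components.
        globals_registry["output_size"] = num_attention_heads * self.encoding_size

SelfAttentionSketch()
print(globals_registry["output_size"])  # 800: matches question_attention_activation_size below
```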
[New file added in the diff; filename not captured]

@@ -0,0 +1,133 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml

training:
  problem:
    batch_size: 64
    # Apply all preprocessing/data augmentations.
    question_preprocessing: lowercase,remove_punctuation,tokenize
    streams:
      # Problem is returning tokenized questions.
      questions: tokenized_questions

validation:
  problem:
    batch_size: 64
    # Apply all preprocessing/data augmentations.
    question_preprocessing: lowercase,remove_punctuation,tokenize
    streams:
      # Problem is returning tokenized questions.
      questions: tokenized_questions
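The `question_preprocessing` value is a comma-separated list of steps applied in order. A rough Python equivalent of `lowercase,remove_punctuation,tokenize` (a sketch, not the problem class's actual code; the sample question is made up):

```python
import string

def preprocess(question: str) -> list:
    """Rough equivalent of: lowercase, remove_punctuation, tokenize."""
    question = question.lower()
    question = question.translate(str.maketrans("", "", string.punctuation))
    return question.split()

print(preprocess("What abnormality is seen in the image?"))
# ['what', 'abnormality', 'is', 'seen', 'in', 'the', 'image']
```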


pipeline:

  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    # Add activation sizes to globals.
    keys: [question_encoder_output_size, question_attention_activation_size, image_attention_activation_size, pooling_activation_size]
    values: [200, 800, 4096, 512]
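    # Note: 800 presumably corresponds to 4 attention heads x 200 (question encoder
    # output size), and 4096 to 2 attention heads x 2048 (ResNet-50 feature map depth);
    # both are assumptions inferred from the component configs below.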

  ################# PIPE 0: question #################

  # Model 1: Embeddings
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
    embeddings_size: 100
    pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions

  # Model 2: RNN
  question_lstm:
    priority: 1.3
    type: RecurrentNeuralNetwork
    cell_type: GRU
    prediction_mode: Dense
    use_logsoftmax: False
    output_last_state: False
    initial_state: Trainable
    dropout_rate: 0.1
    hidden_size: 128
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: embeddings_size
      prediction_size: question_encoder_output_size

  # Self Attention for question.
  question_attention:
    priority: 1.4
    type: SelfAttention
    latent_size: 128
    num_attention_heads: 4
    streams:
      question_encodings: question_activations
      outputs: question_attention_activations
    globals:
      question_encoding_size: question_encoder_output_size
      output_size: question_attention_activation_size

  ################# PIPE 2: image #################

  # Image encoder.
  image_encoder:
    priority: 2.1
    type: TorchVisionWrapper
    model_type: resnet50
    return_feature_maps: True
    streams:
      inputs: images
      outputs: feature_maps

  # Question-guided attention over the image feature maps.
  image_attention:
    priority: 2.2
    type: VQA_Attention
    dropout_rate: 0.3
    latent_size: 1024
    output_mode: 'Image'
    num_attention_heads: 2
    streams:
      image_encodings: feature_maps
      question_encodings: question_attention_activations
      outputs: image_attention_activations
    globals:
      question_encoding_size: question_attention_activation_size
      output_size: image_attention_activation_size

  ################# PIPE 3: image-question fusion #################

  # MFB fusion of image and question attention activations.
  question_image_fusion:
    priority: 3.1
    type: MultimodalFactorizedBilinearPooling
    dropout_rate: 0.3
    latent_size: 512
    pool_factor: 2
    streams:
      image_encodings: image_attention_activations
      question_encodings: question_attention_activations
      outputs: pooling_activations
    globals:
      image_encoding_size: image_attention_activation_size
      question_encoding_size: question_attention_activation_size
      output_size: pooling_activation_size

  # Final answer classifier.
  classifier:
    priority: 4.1
    type: FeedForwardNetwork
    hidden_sizes: [100]
    dropout_rate: 0.2
    streams:
      inputs: pooling_activations
    globals:
      input_size: pooling_activation_size
      prediction_size: vocabulary_size_c2

#: pipeline
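For reference, here is a minimal sketch of the Multi-modal Factorized Bilinear (MFB) pooling used in the fusion step (Yu et al., ICCV 2017). Reading `latent_size` and `pool_factor` as a projection to `latent_size * pool_factor` units followed by sum-pooling is an assumption, but it is consistent with the sizes above (4096- and 800-dim inputs, 512-dim output):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MFBSketch(nn.Module):
    """Minimal MFB pooling sketch; class and argument names are hypothetical."""

    def __init__(self, image_size=4096, question_size=800,
                 latent_size=512, pool_factor=2, dropout_rate=0.3):
        super().__init__()
        self.pool_factor = pool_factor
        # Project both modalities into a shared (latent_size * pool_factor) space.
        self.image_proj = nn.Linear(image_size, latent_size * pool_factor)
        self.question_proj = nn.Linear(question_size, latent_size * pool_factor)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, image_enc, question_enc):
        # Element-wise product in the joint space approximates bilinear pooling.
        joint = self.dropout(self.image_proj(image_enc) * self.question_proj(question_enc))
        # Sum-pool over consecutive groups of pool_factor units: 1024 -> 512.
        joint = joint.view(joint.size(0), -1, self.pool_factor).sum(dim=2)
        # Power and L2 normalization, as in the original MFB paper.
        joint = torch.sign(joint) * torch.sqrt(torch.abs(joint) + 1e-12)
        return F.normalize(joint, dim=-1)

# Shapes line up with the pipeline: [8, 4096] x [8, 800] -> [8, 512].
print(MFBSketch()(torch.randn(8, 4096), torch.randn(8, 800)).shape)  # torch.Size([8, 512])
```

The power and L2 normalizations follow the original paper; the component's defaults may differ.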