diff --git a/configs/default/components/models/sentence_embeddings.yml b/configs/default/components/models/sentence_embeddings.yml
index 0056849..5feccd7 100644
--- a/configs/default/components/models/sentence_embeddings.yml
+++ b/configs/default/components/models/sentence_embeddings.yml
@@ -13,7 +13,7 @@ source_vocabulary_files: ''
 # Additional tokens that will be added to vocabulary (LOADED)
 # This list can be extended, but <PAD> and <EOS> are special tokens.
 # <PAD> is ALWAYS used for padding shorter sequences.
-additional_tokens: ','
+additional_tokens: ''
 
 # Enable <EOS> (end of sequence) token.
 eos_token: False
diff --git a/configs/default/components/models/vqa/attention.yml b/configs/default/components/models/vqa/attention.yml
new file mode 100644
index 0000000..830f4b8
--- /dev/null
+++ b/configs/default/components/models/vqa/attention.yml
@@ -0,0 +1,58 @@
+# This file defines the default values for the VQA_Attention model.
+
+####################################################################
+# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
+####################################################################
+
+# Dropout rate (LOADED)
+# Default: 0 (means that it is turned off)
+dropout_rate: 0
+
+# Size of the latent space (LOADED)
+latent_size: 100
+
+# Number of attention heads (LOADED)
+num_attention_heads: 2
+
+
+streams:
+  ####################################################################
+  # 2. Keymappings associated with INPUT and OUTPUT streams.
+  ####################################################################
+
+  # Stream containing batch of encoded images (INPUT)
+  feature_maps: feature_maps
+
+  # Stream containing batch of encoded questions (INPUT)
+  question_encodings: question_encodings
+
+  # Stream containing outputs (OUTPUT)
+  outputs: outputs
+
+globals:
+  ####################################################################
+  # 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
+  ####################################################################
+
+  # Height of the features tensor (RETRIEVED)
+  feature_maps_height: feature_maps_height
+
+  # Width of the features tensor (RETRIEVED)
+  feature_maps_width: feature_maps_width
+
+  # Depth of the features tensor (RETRIEVED)
+  feature_maps_depth: feature_maps_depth
+
+  # Size of the question encodings input (RETRIEVED)
+  question_encoding_size: question_encoding_size
+
+  # Size of the output (RETRIEVED)
+  output_size: output_size
+
+  ####################################################################
+  # 4. Keymappings associated with GLOBAL variables that will be SET.
+  ####################################################################
+
+  ####################################################################
+  # 5. Keymappings associated with statistics that will be ADDED.
+  ####################################################################
diff --git a/configs/default/components/publishers/bleu_statistics.yml b/configs/default/components/publishers/bleu_statistics.yml
index a79a245..c51f387 100644
--- a/configs/default/components/publishers/bleu_statistics.yml
+++ b/configs/default/components/publishers/bleu_statistics.yml
@@ -13,6 +13,9 @@ use_prediction_distributions: True
 # TODO!
 #use_masking: False
 
+# Ignored words - useful for ignoring special tokens
+ignored_words: ["<PAD>", "<EOS>"]
+
 # Weights of n-grams used when calculating the score.
weights: [0.25, 0.25, 0.25, 0.25] diff --git a/configs/default/components/publishers/stream_file_exporter.yml b/configs/default/components/publishers/stream_file_exporter.yml index 1a5546f..3d83b20 100644 --- a/configs/default/components/publishers/stream_file_exporter.yml +++ b/configs/default/components/publishers/stream_file_exporter.yml @@ -11,6 +11,9 @@ input_streams: '' # Separator that will be placed between values (LOADED) separator: ',' +# Adds additional line to output file enabling Excel to use different separator while loading (LOADED) +export_separator_line_to_csv: False + # Name of the file containing output values (LOADED) filename: 'outputs.txt' diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet152_ewm_cat_is.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet152_ewm_cat_is.yml index 0bce435..b27aea1 100644 --- a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet152_ewm_cat_is.yml +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet152_ewm_cat_is.yml @@ -5,9 +5,9 @@ training: problem: batch_size: 48 # Appy all preprocessing/data augmentations. - image_preprocessing: all + image_preprocessing: normalize # none | random_affine | random_horizontal_flip | normalize | all - question_preprocessing: all + question_preprocessing: lowercase,remove_punctuation,tokenize # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all streams: # Problem is returning tokenized questions. @@ -24,7 +24,6 @@ validation: pipeline: - name: c2_class_lstm_resnet152_ewm_cat_is global_publisher: priority: 0 @@ -96,6 +95,7 @@ pipeline: type: FeedForwardNetwork hidden_sizes: [100] dropout_rate: 0.5 + use_logsoftmax: False streams: inputs: element_wise_activations predictions: question_image_activations diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet152_rn_cat_is.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet152_rn_cat_is.yml index d9020d2..1a1a05c 100644 --- a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet152_rn_cat_is.yml +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet152_rn_cat_is.yml @@ -5,9 +5,9 @@ training: problem: batch_size: 32 # Appy all preprocessing/data augmentations. - image_preprocessing: all + image_preprocessing: normalize # none | random_affine | random_horizontal_flip | normalize | all - question_preprocessing: all + question_preprocessing: lowercase,remove_punctuation,tokenize # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all streams: # Problem is returning tokenized questions. @@ -24,7 +24,6 @@ validation: pipeline: - name: c2_class_lstm_resnet152_rn_cat_is global_publisher: priority: 0 diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_attn_cat_is.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_attn_cat_is.yml new file mode 100644 index 0000000..08b043e --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_attn_cat_is.yml @@ -0,0 +1,102 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + problem: + batch_size: 48 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + # Problem is returning tokenized questions. 
+ questions: tokenized_questions + +validation: + problem: + batch_size: 48 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + + +pipeline: + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, attention_activation_size, question_image_activation_size] + values: [100, 4196, 300] + + ################# PIPE 0: question ################# + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 100 + pretrained_embeddings_file: glove.6B.100d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state: Trainable + dropout_rate: 0.1 + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + model_type: resnet50 + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + ################# PIPE 3: image-question fusion ################# + # Attention + FF. + question_image_fusion: + priority: 4.1 + type: VQA_Attention + dropout_rate: 0.5 + latent_size: 100 + num_attention_heads: 2 + streams: + image_encodings: feature_maps + question_encodings: question_activations + outputs: attention_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: attention_activation_size + + classifier: + priority: 5.1 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + streams: + inputs: attention_activations + globals: + input_size: attention_activation_size + prediction_size: vocabulary_size_c2 + + + #: pipeline diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_ewm_cat_is.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_ewm_cat_is.yml new file mode 100644 index 0000000..2db4248 --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_ewm_cat_is.yml @@ -0,0 +1,142 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + problem: + batch_size: 48 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + +validation: + problem: + batch_size: 48 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + + +pipeline: + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. 
+ keys: [question_encoder_output_size, image_encoder_output_size, element_wise_activation_size,image_size_encoder_input_size, image_size_encoder_output_size] + values: [100, 100, 100, 2, 10] + + ################# PIPE 0: question ################# + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 100 + pretrained_embeddings_file: glove.6B.100d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state: Trainable + dropout_rate: 0.1 + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + model_type: resnet50 + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: image-question fusion ################# + # Element wise multiplication + FF. + question_image_fusion: + priority: 4.1 + type: ElementWiseMultiplication + dropout_rate: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: element_wise_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: element_wise_activation_size + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + use_logsoftmax: False + streams: + inputs: element_wise_activations + predictions: question_image_activations + globals: + input_size: element_wise_activation_size + prediction_size: element_wise_activation_size + + ################# PIPE 5: image-question-image size fusion + classification ################# + # Model - image size FFN. + image_size_encoder: + priority: 5.1 + type: FeedForwardNetwork + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + # 4th subpipeline: concatenation + FF. + concat: + priority: 5.2 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,10]] + output_dims: [-1,110] + streams: + outputs: concatenated_activations + globals: + output_size: concatentated_activations_size + + + classifier: + priority: 5.3 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + globals: + input_size: concatentated_activations_size + prediction_size: vocabulary_size_c2 + + + #: pipeline diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_rn_cat_is.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_rn_cat_is.yml new file mode 100644 index 0000000..51dd275 --- /dev/null +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_rn_cat_is.yml @@ -0,0 +1,141 @@ +# Load config defining problems for training, validation and testing. 
+default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml + +training: + problem: + batch_size: 32 + # Appy all preprocessing/data augmentations. + image_preprocessing: normalize + # none | random_affine | random_horizontal_flip | normalize | all + question_preprocessing: lowercase,remove_punctuation,tokenize + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + +validation: + problem: + batch_size: 32 + question_preprocessing: lowercase,remove_punctuation,tokenize + # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all + streams: + # Problem is returning tokenized questions. + questions: tokenized_questions + + +pipeline: + + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size,rn_activation_size,image_size_encoder_input_size, image_size_encoder_output_size] + values: [100, 100, 2, 10] + + ################# PIPE 0: question ################# + + # Model 1: Embeddings + question_embeddings: + priority: 1.2 + type: SentenceEmbeddings + embeddings_size: 100 + pretrained_embeddings_file: glove.6B.100d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: RNN + question_lstm: + priority: 1.3 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + use_logsoftmax: False + initial_state: Trainable + dropout_rate: 0.1 + hidden_size: 50 + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + ################# PIPE 2: image ################# + # Image encoder. + image_encoder: + priority: 3.1 + type: TorchVisionWrapper + model_type: resnet50 + return_feature_maps: True + streams: + inputs: images + outputs: feature_maps + + ################# PIPE 3: Fusion: Relational Network ################# + # Object-object relations. + question_image_fusion: + priority: 4.1 + type: RelationalNetwork + dropout_rate: 0.5 + g_theta_sizes: [512, 256] + streams: + question_encodings: question_activations + outputs: fused_image_question_activations + globals: + question_encoding_size: question_encoder_output_size + output_size: fused_image_question_activation_size + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [128,100] + dropout_rate: 0.5 + streams: + inputs: fused_image_question_activations + predictions: rn_activation + globals: + input_size: fused_image_question_activation_size + prediction_size: rn_activation_size + + + ################# PIPE 5: image-question-image size fusion + classification ################# + # Model - image size FFN. + image_size_encoder: + priority: 5.1 + type: FeedForwardNetwork + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + # 6th subpipeline: concatenation + FF. 
+ concat: + priority: 5.2 + type: Concatenation + input_streams: [rn_activation,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,10]] + output_dims: [-1,110] + streams: + outputs: concatenated_activations + globals: + output_size: concatentated_activations_size + + classifier: + priority: 5.3 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + globals: + input_size: concatentated_activations_size + prediction_size: vocabulary_size_c2 + + #: pipeline diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_vgg16_rn.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_vgg16_rn.yml index 4991b84..14e4de2 100644 --- a/configs/vqa_med_2019/c2_classification/c2_class_lstm_vgg16_rn.yml +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_vgg16_rn.yml @@ -4,9 +4,9 @@ default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml training: problem: # Appy all preprocessing/data augmentations. - image_preprocessing: all + image_preprocessing: normalize # none | random_affine | random_horizontal_flip | normalize | all - question_preprocessing: all + question_preprocessing: lowercase,remove_punctuation,tokenize # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all streams: # Problem is returning tokenized questions. @@ -22,7 +22,6 @@ validation: pipeline: - name: c2_class_lstm_vgg16_rn global_publisher: priority: 0 diff --git a/configs/vqa_med_2019/c2_classification/c2_class_lstm_vgg16_rn_cat_is.yml b/configs/vqa_med_2019/c2_classification/c2_class_lstm_vgg16_rn_cat_is.yml index c97870b..22c25e4 100644 --- a/configs/vqa_med_2019/c2_classification/c2_class_lstm_vgg16_rn_cat_is.yml +++ b/configs/vqa_med_2019/c2_classification/c2_class_lstm_vgg16_rn_cat_is.yml @@ -4,9 +4,9 @@ default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml training: problem: # Appy all preprocessing/data augmentations. - image_preprocessing: all + image_preprocessing: normalize # none | random_affine | random_horizontal_flip | normalize | all - question_preprocessing: all + question_preprocessing: lowercase,remove_punctuation,tokenize # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all streams: # Problem is returning tokenized questions. 
@@ -22,7 +22,6 @@ validation: pipeline: - name: c2_class_lstm_vgg16_rn_cat_is global_publisher: priority: 0 diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml index d3aa792..51fba8d 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_concat.yml @@ -2,7 +2,6 @@ default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml pipeline: - name: vqa_med_c2_classification_all_rnn_vgg_concat global_publisher: priority: 0 diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml index 84c8bf8..5447526 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm.yml @@ -2,7 +2,6 @@ default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml pipeline: - name: c2_classification_all_rnn_vgg16_ewm global_publisher: priority: 0 diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml index 7db3a3c..1a1f774 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_ewm_size.yml @@ -2,7 +2,6 @@ default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml pipeline: - name: c2_classification_all_rnn_vgg16_ewm_size global_publisher: priority: 0 @@ -80,6 +79,7 @@ pipeline: type: FeedForwardNetwork hidden_sizes: [100] dropout_rate: 0.5 + use_logsoftmax: False streams: inputs: element_wise_activations predictions: question_image_activations diff --git a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml index cabc1dc..d28a24f 100644 --- a/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml +++ b/configs/vqa_med_2019/c2_classification/c2_classification_all_rnn_vgg16_mcb.yml @@ -2,7 +2,6 @@ default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml pipeline: - name: c2_classification_all_rnn_vgg16_mcb global_publisher: priority: 0 diff --git a/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml b/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml index 73dcce7..2d28708 100644 --- a/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml +++ b/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml @@ -14,7 +14,6 @@ validation: batch_size: 128 pipeline: - name: c2_word_answer_onehot_bow # Answer encoding. 
answer_tokenizer: diff --git a/configs/vqa_med_2019/c2_classification/default_c2_classification.yml b/configs/vqa_med_2019/c2_classification/default_c2_classification.yml index 68f5880..9511a28 100644 --- a/configs/vqa_med_2019/c2_classification/default_c2_classification.yml +++ b/configs/vqa_med_2019/c2_classification/default_c2_classification.yml @@ -82,6 +82,6 @@ pipeline: viewer: type: StreamViewer priority: 100.4 - input_streams: questions,category_names,answers,predicted_answers + input_streams: tokenized_questions,category_names,answers,predicted_answers #: pipeline diff --git a/configs/vqa_med_2019/default_vqa_med_2019.yml b/configs/vqa_med_2019/default_vqa_med_2019.yml index 11d7222..dfe01a6 100644 --- a/configs/vqa_med_2019/default_vqa_med_2019.yml +++ b/configs/vqa_med_2019/default_vqa_med_2019.yml @@ -22,7 +22,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-2 + loss_stop: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/default_extend_answers.yml b/configs/vqa_med_2019/extend_answers.yml similarity index 55% rename from configs/vqa_med_2019/default_extend_answers.yml rename to configs/vqa_med_2019/extend_answers.yml index 270d5d1..9e2f9a4 100644 --- a/configs/vqa_med_2019/default_extend_answers.yml +++ b/configs/vqa_med_2019/extend_answers.yml @@ -9,6 +9,10 @@ training_answers: categories: all resize_image: &resize_image [224, 224] batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions dataloader: # No sampler, process samples in the same order. shuffle: false @@ -22,6 +26,10 @@ validation_answers: split: validation resize_image: *resize_image batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions dataloader: # No sampler, process samples in the same order. shuffle: false @@ -37,6 +45,10 @@ test_answers: split: test resize_image: *resize_image batch_size: 64 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions dataloader: # No sampler, process samples in the same order. shuffle: false @@ -45,18 +57,35 @@ test_answers: # Add component for exporting answers to files. pipeline: - disable: viewer -# # Viewers. + disable: viewer,question_tokenizer + # Viewers. 
viewer_extended: priority: 100.4 type: StreamViewer sample_number: 0 - input_streams: indices,image_ids,questions,category_names,predicted_categories,answers,tokenized_answers,predicted_answers + input_streams: + indices,image_ids,tokenized_questions, + category_names,predicted_categories, + answers,tokenized_answers,predicted_answers - exporter: + answer_exporter: priority: 100.5 type: StreamFileExporter separator: '|' - input_streams: indices,image_ids,questions,category_names,predicted_categories,answers,tokenized_answers,predicted_answers + filename: 'answers.csv' + export_separator_line_to_csv: True + input_streams: + indices,image_ids,tokenized_questions, + category_names,predicted_categories, + answers,tokenized_answers,predicted_answers + + submission_exporter: + priority: 100.6 + type: StreamFileExporter + separator: '|' + filename: 'submission.txt' + input_streams: + image_ids, + predicted_answers #: pipeline diff --git a/configs/vqa_med_2019/vf/lstm_resnet152_is_cat_ffn_c123_no_binary_loss.yml b/configs/vqa_med_2019/vf/lstm_resnet152_is_cat_ffn_c123_no_binary_loss.yml new file mode 100644 index 0000000..5a541f2 --- /dev/null +++ b/configs/vqa_med_2019/vf/lstm_resnet152_is_cat_ffn_c123_no_binary_loss.yml @@ -0,0 +1,307 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +# Training parameters: +training: + problem: + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + +# Validation parameters: +validation: + problem: + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + type: GlobalVariablePublisher + priority: 0 + # Add input_size to globals. + keys: [question_lstm_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, category_c123_without_yn_word_to_ix] + values: [100, 2, 10, 100, {"C1": 0, "C2": 1, "C3": 2}] + + # Statistics. 
+ batch_size: + type: BatchSizeStatistics + priority: 0.1 + + ################# PIPE 0: CATEGORY ################# + + # Model 1: question embeddings + pipe0_question_embeddings: + type: SentenceEmbeddings + priority: 0.3 + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + + # Model 2: question RNN + pipe0_lstm: + priority: 0.4 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_questions_activations + globals: + input_size: embeddings_size + prediction_size: question_lstm_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.5 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.5 + streams: + inputs: pipe0_questions_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: question_lstm_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + type: AccuracyStatistics + priority: 0.7 + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + type: SentenceEmbeddings + priority: 1.1 + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: questions_activations + globals: + input_size: embeddings_size + prediction_size: question_lstm_output_size + + # Answer encoding + pipe1_all_answer_indexer: + type: LabelIndexer + priority: 1.3 + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. 
+ export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: all_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + type: TorchVisionWrapper + model: resnet152 + priority: 2.1 + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + type: FeedForwardNetwork + priority: 3.1 + use_losfotmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: SHARED CONCAT ################# + + concat: + type: Concatenation + priority: 4.1 + input_streams: [questions_activations,image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,100],[-1,10]] + output_dims: [-1,210] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + + ################# PIPE 5: C1 + C2 + C3 questions ################# + + # Answer encoding for PIPE 5. + pipe5_c123_without_yn_answer_indexer: + type: LabelIndexer + priority: 5.1 + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: pipe5_c123_without_yn_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + # Sample masking based on categories. + pipe5_c123_without_yn_string_to_mask: + priority: 5.2 + type: StringToMask + globals: + word_mappings: category_c123_without_yn_word_to_ix + streams: + strings: pipe0_predicted_question_categories_names + string_indices: predicted_c123_by_question_categories_indices # NOT USED + masks: pipe5_c123_without_yn_masks + + # Model 4: FFN C1 answering + pipe5_c123_without_yn_ffn: + priority: 5.3 + type: FeedForwardNetwork + hidden: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe5_c123_without_yn_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_without_yn + + pipe5_c123_without_yn_nllloss: + type: NLLLoss + priority: 5.4 + targets_dim: 1 + use_masking: True + streams: + predictions: pipe5_c123_without_yn_predictions + masks: pipe5_c123_without_yn_masks + targets: pipe5_c123_without_yn_answers_ids + loss: pipe5_c123_without_yn_loss + + pipe5_c123_without_yn_precision_recall: + type: PrecisionRecallStatistics + priority: 5.5 + use_word_mappings: True + use_masking: True + show_class_scores: True + #show_confusion_matrix: True + streams: + masks: pipe5_c123_without_yn_masks + predictions: pipe5_c123_without_yn_predictions + targets: pipe5_c123_without_yn_answers_ids + globals: + word_mappings: word_mappings_c123_without_yn + statistics: + precision: pipe5_c123_without_yn_precision + recall: pipe5_c123_without_yn_recall + f1score: pipe5_c123_without_yn_f1score + + # C123 Predictions decoder. + pipe5_prediction_decoder: + type: WordDecoder + priority: 5.6 + # Use the same word mappings as label indexer. 
+ import_word_mappings_from_globals: True + streams: + inputs: pipe5_c123_without_yn_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_without_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + + # Viewers. + viewer: + type: StreamViewer + priority: 9.3 + input_streams: + tokenized_questions, category_names, + pipe0_predicted_question_categories_names, + pipe5_c123_without_yn_masks, + answers, predicted_answers + + +#: pipeline diff --git a/configs/vqa_med_2019/vf/lstm_resnet50_ewm_is_cat_ffn_c123_loss_ffn_yn_loss.yml b/configs/vqa_med_2019/vf/lstm_resnet50_ewm_is_cat_ffn_c123_loss_ffn_yn_loss.yml new file mode 100644 index 0000000..1b3f29d --- /dev/null +++ b/configs/vqa_med_2019/vf/lstm_resnet50_ewm_is_cat_ffn_c123_loss_ffn_yn_loss.yml @@ -0,0 +1,454 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +# Training parameters: +training: + problem: + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + +# Validation parameters: +validation: + problem: + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, element_wise_activation_size, category_c123_without_yn_word_to_ix,category_binary_yn_word_to_ix] + values: [100, 2, 10, 100, 100, {"C1": 0, "C2": 1, "C3": 2}, {"BINARY": 3}] + + # Statistics. 
+ batch_size: + priority: 0.1 + type: BatchSizeStatistics + + ################# PIPE 0: CATEGORY ################# + + # Model 1: question embeddings + pipe0_question_embeddings: + priority: 0.3 + type: SentenceEmbeddings + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + + # Model 2: question RNN + pipe0_lstm: + priority: 0.4 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.5 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.5 + streams: + inputs: pipe0_question_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: question_encoder_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + priority: 0.7 + type: AccuracyStatistics + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + # Answer encoding + pipe1_all_answer_indexer: + priority: 1.3 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv + # Export mappings and size to globals. 
+ export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: all_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model: resnet50 + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + priority: 3.1 + type: FeedForwardNetwork + use_losfotmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: image-question fusion ################# + # Element wise multiplication + FF. + question_image_fusion: + priority: 4.1 + type: ElementWiseMultiplication + dropout_rate: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: element_wise_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: element_wise_activation_size + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + use_logsoftmax: False + streams: + inputs: element_wise_activations + predictions: question_image_activations + globals: + input_size: element_wise_activation_size + prediction_size: element_wise_activation_size + + ################# PIPE 5: image-question-image size fusion ################# + + # 5th subpipeline: concatenation + concat: + priority: 5.1 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,10]] + output_dims: [-1,110] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + ################# PIPE 6: C1 + C2 + C3 questions ################# + + # Answer encoding for PIPE 6. + pipe6_c123_answer_indexer: + priority: 6.1 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: pipe6_c123_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + # Sample masking based on categories. 
+ pipe6_c123_string_to_mask: + priority: 6.2 + type: StringToMask + globals: + word_mappings: category_c123_without_yn_word_to_ix + streams: + strings: pipe0_predicted_question_categories_names + string_indices: predicted_c123_by_question_categories_indices # NOT USED + masks: pipe6_c123_masks + + # Model 4: FFN C123 answering + pipe6_c123_answer_classifier: + priority: 6.3 + type: FeedForwardNetwork + hidden: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe6_c123_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_without_yn + + pipe6_c123_nllloss: + priority: 6.4 + type: NLLLoss + targets_dim: 1 + use_masking: True + streams: + predictions: pipe6_c123_predictions + masks: pipe6_c123_masks + targets: pipe6_c123_answers_ids + loss: pipe6_c123_loss + + pipe6_c123_precision_recall: + priority: 6.5 + type: PrecisionRecallStatistics + use_word_mappings: True + use_masking: True + show_class_scores: True + #show_confusion_matrix: True + streams: + masks: pipe6_c123_masks + predictions: pipe6_c123_predictions + targets: pipe6_c123_answers_ids + globals: + word_mappings: word_mappings_c123_without_yn + statistics: + precision: pipe6_c123_precision + recall: pipe6_c123_recall + f1score: pipe6_c123_f1score + + # C123 Predictions decoder. + #pipe5_c123_prediction_decoder: + # priority: 6.6 + # type: WordDecoder + # # Use the same word mappings as label indexer. + # import_word_mappings_from_globals: True + # streams: + # inputs: pipe6_c123_predictions + # outputs: pipe6_c123_predicted_answers + # globals: + # word_mappings: word_mappings_c123_without_yn + + + ################# PIPE 7: Y/N questions ################# + + # Answer encoding for PIPE 5. + pipe7_binary_yn_answer_indexer: + type: LabelIndexer + priority: 7.1 + data_folder: ~/data/vqa-med + word_mappings_file: answers.binary_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: pipe7_binary_yn_answers_ids + globals: + vocabulary_size: vocabulary_size_binary_yn + word_mappings: word_mappings_binary_yn + + # Sample masking based on categories. 
+ pipe7_binary_yn_string_to_mask: + priority: 7.2 + type: StringToMask + globals: + word_mappings: category_binary_yn_word_to_ix + streams: + strings: pipe0_predicted_question_categories_names + string_indices: predicted_binary_question_categories_indices # NOT USED + masks: pipe7_binary_yn_masks + + # Model 4: FFN C1 answering + pipe7_binary_yn_classifier: + priority: 7.3 + type: FeedForwardNetwork + hidden: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe7_binary_yn_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_binary_yn + + pipe7_binary_yn_nllloss: + type: NLLLoss + priority: 7.4 + targets_dim: 1 + use_masking: True + streams: + predictions: pipe7_binary_yn_predictions + masks: pipe7_binary_yn_masks + targets: pipe7_binary_yn_answers_ids + loss: pipe7_binary_yn_loss + + pipe7_binary_yn_precision_recall: + type: PrecisionRecallStatistics + priority: 7.5 + use_word_mappings: True + use_masking: True + show_class_scores: True + #show_confusion_matrix: True + streams: + masks: pipe7_binary_yn_masks + predictions: pipe7_binary_yn_predictions + targets: pipe7_binary_yn_answers_ids + globals: + word_mappings: word_mappings_binary_yn + statistics: + precision: pipe7_binary_yn_precision + recall: pipe7_binary_yn_recall + f1score: pipe7_binary_yn_f1score + + # Y/N Predictions decoder. + #pipe7_binary_yn_prediction_decoder: + # type: WordDecoder + # priority: 7.6 + # # Use the same word mappings as label indexer. + # import_word_mappings_from_globals: True + # streams: + # inputs: pipe7_binary_yn_predictions + # outputs: pipe7_binary_yn_predicted_answers + # globals: + # word_mappings: word_mappings_binary_yn + + + ################# PIPE 9: MERGE ANSWERS ################# + + # Merge predictions + pipe8_merged_predictions: + type: JoinMaskedPredictions + priority: 8.1 + # Names of used input streams. + input_prediction_streams: [pipe6_c123_predictions, pipe7_binary_yn_predictions] + input_mask_streams: [pipe6_c123_masks, pipe7_binary_yn_masks] + input_word_mappings: [word_mappings_c123_without_yn, word_mappings_binary_yn] + globals: + output_word_mappings: word_mappings_c123_binary_yn + streams: + output_strings: predicted_answers + output_indices: pipe8_merged_pred_indices + + # Statistics. + pipe8_merged_precision_recall: + type: PrecisionRecallStatistics + priority: 8.2 + # Use prediction indices instead of distributions. + use_prediction_distributions: False + use_word_mappings: True + show_class_scores: True + show_confusion_matrix: True + globals: + word_mappings: word_mappings_c123_binary_yn + streams: + targets: all_answers_ids + predictions: pipe8_merged_pred_indices + statistics: + precision: pipe8_merged_precision + recall: pipe8_merged_recall + f1score: pipe8_merged_f1score + + + + + # Viewers. + viewer: + priority: 9.3 + type: StreamViewer + input_streams: + tokenized_questions, category_names, + pipe0_predicted_question_categories_names, + pipe6_c123_masks, + answers, predicted_answers + + +#: pipeline diff --git a/configs/vqa_med_2019/vf/lstm_resnet50_ewm_is_cat_ffn_c123_no_binary_loss.yml b/configs/vqa_med_2019/vf/lstm_resnet50_ewm_is_cat_ffn_c123_no_binary_loss.yml new file mode 100644 index 0000000..2a46463 --- /dev/null +++ b/configs/vqa_med_2019/vf/lstm_resnet50_ewm_is_cat_ffn_c123_no_binary_loss.yml @@ -0,0 +1,334 @@ +# Load config defining problems for training, validation and testing. 
+default_configs: vqa_med_2019/default_vqa_med_2019.yml + +# Training parameters: +training: + problem: + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + +# Validation parameters: +validation: + problem: + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + priority: 0 + type: GlobalVariablePublisher + # Add input_size to globals. + keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, element_wise_activation_size, category_c123_without_yn_word_to_ix] + values: [100, 2, 10, 100, 100, {"C1": 0, "C2": 1, "C3": 2}] + + # Statistics. + batch_size: + priority: 0.1 + type: BatchSizeStatistics + + ################# PIPE 0: CATEGORY ################# + + # Model 1: question embeddings + pipe0_question_embeddings: + priority: 0.3 + type: SentenceEmbeddings + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + + # Model 2: question RNN + pipe0_lstm: + priority: 0.4 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.5 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.5 + streams: + inputs: pipe0_question_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: question_encoder_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.6 + type: WordDecoder + # Use the same word mappings as label indexer. 
+ import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + priority: 0.7 + type: AccuracyStatistics + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + priority: 1.1 + type: SentenceEmbeddings + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: question_activations + globals: + input_size: embeddings_size + prediction_size: question_encoder_output_size + + # Answer encoding + pipe1_all_answer_indexer: + priority: 1.3 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: all_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + priority: 2.1 + type: TorchVisionWrapper + model: resnet50 + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + priority: 3.1 + type: FeedForwardNetwork + use_losfotmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: image-question fusion ################# + # Element wise multiplication + FF. 
+ question_image_fusion: + priority: 4.1 + type: ElementWiseMultiplication + dropout_rate: 0.5 + streams: + image_encodings: image_activations + question_encodings: question_activations + outputs: element_wise_activations + globals: + image_encoding_size: image_encoder_output_size + question_encoding_size: question_encoder_output_size + output_size: element_wise_activation_size + + question_image_ffn: + priority: 4.2 + type: FeedForwardNetwork + hidden_sizes: [100] + dropout_rate: 0.5 + use_logsoftmax: False + streams: + inputs: element_wise_activations + predictions: question_image_activations + globals: + input_size: element_wise_activation_size + prediction_size: element_wise_activation_size + + ################# PIPE 5: image-question-image size fusion ################# + + # 5th subpipeline: concatenation + concat: + priority: 5.1 + type: Concatenation + input_streams: [question_image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,10]] + output_dims: [-1,110] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + ################# PIPE 6: C1 + C2 + C3 questions ################# + + # Answer encoding for PIPE 6. + pipe6_c123_answer_indexer: + priority: 6.1 + type: LabelIndexer + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: pipe6_c123_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + # Sample masking based on categories. + pipe6_c123_string_to_mask: + priority: 6.2 + type: StringToMask + globals: + word_mappings: category_c123_without_yn_word_to_ix + streams: + strings: pipe0_predicted_question_categories_names + string_indices: predicted_c123_by_question_categories_indices # NOT USED + masks: pipe6_c123_masks + + # Model 4: FFN C123 answering + pipe6_c123_answer_classifier: + priority: 6.3 + type: FeedForwardNetwork + hidden: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe6_c123_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_without_yn + + pipe6_c123_nllloss: + priority: 6.4 + type: NLLLoss + targets_dim: 1 + use_masking: True + streams: + predictions: pipe6_c123_predictions + masks: pipe6_c123_masks + targets: pipe6_c123_answers_ids + loss: pipe6_c123_loss + + pipe6_c123_precision_recall: + priority: 6.5 + type: PrecisionRecallStatistics + use_word_mappings: True + use_masking: True + show_class_scores: True + #show_confusion_matrix: True + streams: + masks: pipe6_c123_masks + predictions: pipe6_c123_predictions + targets: pipe6_c123_answers_ids + globals: + word_mappings: word_mappings_c123_without_yn + statistics: + precision: pipe6_c123_precision + recall: pipe6_c123_recall + f1score: pipe6_c123_f1score + + # C123 Predictions decoder. + pipe5_c123_prediction_decoder: + priority: 6.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe6_c123_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_without_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + # Viewers. 
+ viewer: + priority: 9.3 + type: StreamViewer + input_streams: + tokenized_questions, category_names, + pipe0_predicted_question_categories_names, + pipe6_c123_masks, + answers, predicted_answers + + +#: pipeline diff --git a/configs/vqa_med_2019/vf/lstm_resnet50_is_cat_ffn_c123_no_binary_loss.yml b/configs/vqa_med_2019/vf/lstm_resnet50_is_cat_ffn_c123_no_binary_loss.yml new file mode 100644 index 0000000..2364e06 --- /dev/null +++ b/configs/vqa_med_2019/vf/lstm_resnet50_is_cat_ffn_c123_no_binary_loss.yml @@ -0,0 +1,307 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +# Training parameters: +training: + problem: + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + +# Validation parameters: +validation: + problem: + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + type: GlobalVariablePublisher + priority: 0 + # Add input_size to globals. + keys: [question_lstm_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, category_c123_without_yn_word_to_ix] + values: [100, 2, 10, 100, {"C1": 0, "C2": 1, "C3": 2}] + + # Statistics. + batch_size: + type: BatchSizeStatistics + priority: 0.1 + + ################# PIPE 0: CATEGORY ################# + + # Model 1: question embeddings + pipe0_question_embeddings: + type: SentenceEmbeddings + priority: 0.3 + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + + # Model 2: question RNN + pipe0_lstm: + priority: 0.4 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_questions_activations + globals: + input_size: embeddings_size + prediction_size: question_lstm_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.5 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + dropout_rate: 0.5 + streams: + inputs: pipe0_questions_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: question_lstm_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + 
pipe0_category_decoder: + priority: 0.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + type: AccuracyStatistics + priority: 0.7 + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + type: SentenceEmbeddings + priority: 1.1 + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: questions_activations + globals: + input_size: embeddings_size + prediction_size: question_lstm_output_size + + # Answer encoding + pipe1_all_answer_indexer: + type: LabelIndexer + priority: 1.3 + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: all_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + type: TorchVisionWrapper + model: resnet50 + priority: 2.1 + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + type: FeedForwardNetwork + priority: 3.1 + use_logsoftmax: False + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: SHARED CONCAT ################# + + concat: + type: Concatenation + priority: 4.1 + input_streams: [questions_activations,image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,100],[-1,10]] + output_dims: [-1,210] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + + ################# PIPE 5: C1 + C2 + C3 questions ################# + + # Answer encoding for PIPE 5. + pipe5_c123_answer_indexer: + type: LabelIndexer + priority: 5.1 + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: pipe5_c123_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + # Sample masking based on categories.
+ pipe5_c123_string_to_mask: + priority: 5.2 + type: StringToMask + globals: + word_mappings: category_c123_without_yn_word_to_ix + streams: + strings: pipe0_predicted_question_categories_names + string_indices: predicted_c123_by_question_categories_indices # NOT USED + masks: pipe5_c123_masks + + # Model 4: FFN C1 answering + pipe5_c123_ffn: + priority: 5.3 + type: FeedForwardNetwork + hidden: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe5_c123_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_without_yn + + pipe5_c123_nllloss: + type: NLLLoss + priority: 5.4 + targets_dim: 1 + use_masking: True + streams: + predictions: pipe5_c123_predictions + masks: pipe5_c123_masks + targets: pipe5_c123_answers_ids + loss: pipe5_c123_loss + + pipe5_c123_precision_recall: + type: PrecisionRecallStatistics + priority: 5.5 + use_word_mappings: True + use_masking: True + show_class_scores: True + #show_confusion_matrix: True + streams: + masks: pipe5_c123_masks + predictions: pipe5_c123_predictions + targets: pipe5_c123_answers_ids + globals: + word_mappings: word_mappings_c123_without_yn + statistics: + precision: pipe5_c123_precision + recall: pipe5_c123_recall + f1score: pipe5_c123_f1score + + # C123 Predictions decoder. + pipe5_prediction_decoder: + type: WordDecoder + priority: 5.6 + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe5_c123_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_without_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + + # Viewers. + viewer: + type: StreamViewer + priority: 9.3 + input_streams: + tokenized_questions, category_names, + pipe0_predicted_question_categories_names, + pipe5_c123_masks, + answers, predicted_answers + + +#: pipeline diff --git a/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml b/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_c123_binary_yn_loss.yml similarity index 76% rename from configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml rename to configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_c123_binary_yn_loss.yml index 94af6aa..71c3946 100644 --- a/configs/vqa_med_2019/vf/c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss.yml +++ b/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_c123_binary_yn_loss.yml @@ -6,6 +6,10 @@ training: problem: categories: C1,C2,C3 export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions sampler: weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv @@ -13,10 +17,13 @@ training: validation: problem: categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions pipeline: - name: c1_c2_c3_binary_cat_rnn_shared_all_encoders_one_ffn_loss ################# PIPE 0: SHARED ################# @@ -33,14 +40,6 @@ pipeline: type: BatchSizeStatistics priority: 0.1 - # Questions encoding. 
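The StringToMask component above turns the predicted category names into a per-sample 0/1 mask (1 only for categories present in the word-to-index mapping), and the NLLLoss component then scores only the unmasked samples. A small sketch of that interplay; the component internals and the exact loss reduction are assumptions, not taken from this diff.

import torch

category_word_to_ix = {"C1": 0, "C2": 1, "C3": 2}            # category_c123_without_yn_word_to_ix
predicted_categories = ["C1", "BINARY", "C3", "C4"]          # pipe0_predicted_question_categories_names

# StringToMask: 1 where the string is a key of the mapping, 0 otherwise.
masks = torch.tensor([1.0 if c in category_word_to_ix else 0.0 for c in predicted_categories])

log_probs = torch.log_softmax(torch.randn(4, 10), dim=1)     # pipe5_c123_predictions (log-probabilities)
targets = torch.randint(0, 10, (4,))                         # pipe5_c123_answers_ids

# Masked NLLLoss: average the per-sample loss over the unmasked samples only.
per_sample = torch.nn.functional.nll_loss(log_probs, targets, reduction="none")
loss = (per_sample * masks).sum() / masks.sum().clamp(min=1)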
- pipe1_question_tokenizer: - priority: 0.2 - type: SentenceTokenizer - streams: - inputs: questions - outputs: tokenized_questions - ################# PIPE 0: CATEGORY ################# # Model 1: question embeddings @@ -75,7 +74,6 @@ pipeline: prediction_mode: Last initial_state: Trainable use_logsoftmax: False - dropout_rate: 0.5 streams: inputs: pipe0_embedded_questions predictions: pipe0_questions_activations @@ -94,7 +92,6 @@ pipeline: freeze: True ################### hidden: [50] - dropout_rate: 0.5 streams: inputs: pipe0_questions_activations predictions: pipe0_predicted_question_categories_preds @@ -123,8 +120,6 @@ pipeline: statistics: accuracy: categorization_accuracy - - ################# PIPE 1: SHARED QUESTION ENCODER ################# # Model 1: question embeddings @@ -147,7 +142,6 @@ pipeline: prediction_mode: Last initial_state: Trainable use_logsoftmax: False - dropout_rate: 0.5 streams: inputs: embedded_questions predictions: questions_activations @@ -167,14 +161,15 @@ pipeline: inputs: answers outputs: all_answers_ids globals: - vocabulary_size: vocabulary_size_all_c1_c2_c3_binary - word_mappings: word_mappings_all_c1_c2_c3_binary + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn ################# PIPE 2: SHARED IMAGE ENCODER ################# # Image encoder. image_encoder: type: TorchVisionWrapper + model: vgg16 priority: 2.1 streams: inputs: images @@ -211,10 +206,10 @@ pipeline: output_size: concatenated_activations_size - ################# PIPE 5: C1 + C2 + C2 + Binary Y/N question ################# + ################# PIPE 5: C1 + C2 + C3 + BINARY questions ################# # Answer encoding for PIPE 5. - pipe5_all_answer_indexer: + pipe5_c123_binary_yn_answer_indexer: type: LabelIndexer priority: 5.1 data_folder: ~/data/vqa-med @@ -223,72 +218,88 @@ pipeline: export_word_mappings_to_globals: True streams: inputs: answers - outputs: pipe5_all_answers_ids + outputs: pipe5_c123_binary_yn_answers_ids globals: - vocabulary_size: vocabulary_size_c1_c2_c3_binary - word_mappings: word_mappings_all_c1_c2_c3_binary + vocabulary_size: vocabulary_size_c123_binary_yn + word_mappings: word_mappings_c123_binary_yn # Sample masking based on categories. 
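These configs now ask the problem itself for "lowercase,remove_punctuation,tokenize" question preprocessing, which is why the separate SentenceTokenizer component is removed from the pipeline here. A minimal illustration of those three steps; the problem class may differ in details such as punctuation handling.

import string

def preprocess_question(question: str) -> list:
    question = question.lower()                                                 # lowercase
    question = question.translate(str.maketrans("", "", string.punctuation))    # remove_punctuation
    return question.split()                                                     # tokenize (whitespace)

print(preprocess_question("What abnormality is seen in the image?"))
# ['what', 'abnormality', 'is', 'seen', 'in', 'the', 'image']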
- pipe5_all_string_to_mask: + pipe5_c123_binary_yn_string_to_mask: priority: 5.2 type: StringToMask globals: word_mappings: category_c1_c2_c3_binary_yn_word_to_ix streams: strings: pipe0_predicted_question_categories_names - string_indices: predicted_c1_c2_c3_binary_by_question_categories_indices # NOT USED - masks: pipe5_all_masks + string_indices: predicted_c123_by_question_categories_indices # NOT USED + masks: pipe5_c123_binary_yn_masks # Model 4: FFN C1 answering - pipe5_all_ffn: + pipe5_c123_binary_yn_ffn: priority: 5.3 type: FeedForwardNetwork - hidden: [50] + hidden: [100] dropout_rate: 0.5 streams: inputs: concatenated_activations - predictions: pipe5_all_predictions + predictions: pipe5_c123_binary_yn_predictions globals: input_size: concatenated_activations_size - prediction_size: vocabulary_size_c1_c2_c3_binary + prediction_size: vocabulary_size_c123_binary_yn - pipe5_all_nllloss: + pipe5_c123_binary_yn_nllloss: type: NLLLoss priority: 5.4 targets_dim: 1 use_masking: True streams: - predictions: pipe5_all_predictions - masks: pipe5_all_masks - targets: pipe5_all_answers_ids - loss: pipe5_all_loss + predictions: pipe5_c123_binary_yn_predictions + masks: pipe5_c123_binary_yn_masks + targets: pipe5_c123_binary_yn_answers_ids + loss: pipe5_c123_binary_yn_loss - pipe5_all_precision_recall: + pipe5_c123_binary_yn_precision_recall: type: PrecisionRecallStatistics priority: 5.5 use_word_mappings: True use_masking: True - #show_class_scores: True + show_class_scores: True #show_confusion_matrix: True streams: - masks: pipe5_all_masks - predictions: pipe5_all_predictions - targets: pipe5_all_answers_ids + masks: pipe5_c123_binary_yn_masks + predictions: pipe5_c123_binary_yn_predictions + targets: pipe5_c123_binary_yn_answers_ids globals: - word_mappings: word_mappings_all_c1_c2_c3_binary + word_mappings: word_mappings_c123_binary_yn statistics: - precision: pipe5_all_precision - recall: pipe5_all_recall - f1score: pipe5_all_f1score + precision: pipe5_c123_binary_yn_precision + recall: pipe5_c123_binary_yn_recall + f1score: pipe5_c123_binary_yn_f1score + # C123 Predictions decoder. + pipe5_prediction_decoder: + type: WordDecoder + priority: 5.6 + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe5_c123_binary_yn_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_binary_yn ################# PIPE 9: MERGE ANSWERS ################# + # Viewers. viewer: type: StreamViewer priority: 9.3 - input_streams: questions,answers, category_names,predicted_question_categories_names, pipe5_all_masks,pipe5_all_answers_without_yn_ids,pipe5_all_predictions + input_streams: + tokenized_questions, category_names, + pipe0_predicted_question_categories_names, + pipe5_c123_binary_yn_masks, + answers, predicted_answers #: pipeline diff --git a/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_c123_no_yn_loss.yml b/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_c123_no_yn_loss.yml new file mode 100644 index 0000000..1bf7bdc --- /dev/null +++ b/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_c123_no_yn_loss.yml @@ -0,0 +1,305 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +# Training parameters: +training: + problem: + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + # Appy all preprocessing/data augmentations. 
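The newly added pipe5_prediction_decoder maps the winning class of each prediction back to an answer string through the word mappings exported by the label indexer. A sketch of that assumed behaviour, using toy mappings in place of the real answer vocabulary:

import torch

word_mappings = {"yes": 0, "no": 1, "axial": 2, "ct": 3}     # toy stand-in for word_mappings_c123_binary_yn
ix_to_word = {ix: word for word, ix in word_mappings.items()}

predictions = torch.log_softmax(torch.randn(3, len(word_mappings)), dim=1)
predicted_answers = [ix_to_word[int(ix)] for ix in predictions.argmax(dim=1)]
print(predicted_answers)      # e.g. ['axial', 'no', 'ct']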
+ question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv + +# Validation parameters: +validation: + problem: + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + type: GlobalVariablePublisher + priority: 0 + # Add input_size to globals. + keys: [question_lstm_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, category_c1_c2_c3_without_yn_word_to_ix] + values: [100, 2, 10, 100, {"C1": 0, "C2": 1, "C3": 2}] + + # Statistics. + batch_size: + type: BatchSizeStatistics + priority: 0.1 + + ################# PIPE 0: CATEGORY ################# + + # Model 1: question embeddings + pipe0_question_embeddings: + type: SentenceEmbeddings + priority: 0.3 + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + + # Model 2: question RNN + pipe0_lstm: + priority: 0.4 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_questions_activations + globals: + input_size: embeddings_size + prediction_size: question_lstm_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.5 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + streams: + inputs: pipe0_questions_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: question_lstm_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.6 + type: WordDecoder + # Use the same word mappings as label indexer. 
+ import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + type: AccuracyStatistics + priority: 0.7 + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + type: SentenceEmbeddings + priority: 1.1 + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: questions_activations + globals: + input_size: embeddings_size + prediction_size: question_lstm_output_size + + # Answer encoding + pipe1_all_answer_indexer: + type: LabelIndexer + priority: 1.3 + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: all_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + type: TorchVisionWrapper + model: vgg16 + priority: 2.1 + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + type: FeedForwardNetwork + priority: 3.1 + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: SHARED CONCAT ################# + + concat: + type: Concatenation + priority: 4.1 + input_streams: [questions_activations,image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,100],[-1,10]] + output_dims: [-1,210] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + + ################# PIPE 5: C1 + C2 + C3 questions ################# + + # Answer encoding for PIPE 5. + pipe5_c123_without_yn_answer_indexer: + type: LabelIndexer + priority: 5.1 + data_folder: ~/data/vqa-med + word_mappings_file: answers.c1_c2_c3_without_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: pipe5_c123_without_yn_answers_ids + globals: + vocabulary_size: vocabulary_size_c123_without_yn + word_mappings: word_mappings_c123_without_yn + + # Sample masking based on categories. 
+ pipe5_c123_without_yn_string_to_mask: + priority: 5.2 + type: StringToMask + globals: + word_mappings: category_c1_c2_c3_without_yn_word_to_ix + streams: + strings: pipe0_predicted_question_categories_names + string_indices: predicted_c123_by_question_categories_indices # NOT USED + masks: pipe5_c123_without_yn_masks + + # Model 4: FFN C1 answering + pipe5_c123_without_yn_ffn: + priority: 5.3 + type: FeedForwardNetwork + hidden: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe5_c123_without_yn_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_c123_without_yn + + pipe5_c123_without_yn_nllloss: + type: NLLLoss + priority: 5.4 + targets_dim: 1 + use_masking: True + streams: + predictions: pipe5_c123_without_yn_predictions + masks: pipe5_c123_without_yn_masks + targets: pipe5_c123_without_yn_answers_ids + loss: pipe5_c123_without_yn_loss + + pipe5_c123_without_yn_precision_recall: + type: PrecisionRecallStatistics + priority: 5.5 + use_word_mappings: True + use_masking: True + show_class_scores: True + #show_confusion_matrix: True + streams: + masks: pipe5_c123_without_yn_masks + predictions: pipe5_c123_without_yn_predictions + targets: pipe5_c123_without_yn_answers_ids + globals: + word_mappings: word_mappings_c123_without_yn + statistics: + precision: pipe5_c123_without_yn_precision + recall: pipe5_c123_without_yn_recall + f1score: pipe5_c123_without_yn_f1score + + # C123 Predictions decoder. + pipe5_prediction_decoder: + type: WordDecoder + priority: 5.6 + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe5_c123_without_yn_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_c123_without_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + + # Viewers. + viewer: + type: StreamViewer + priority: 9.3 + input_streams: + tokenized_questions, category_names, + pipe0_predicted_question_categories_names, + pipe5_c123_without_yn_masks, + answers, predicted_answers + + +#: pipeline diff --git a/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_only_yn_loss.yml b/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_only_yn_loss.yml new file mode 100644 index 0000000..7cbe09e --- /dev/null +++ b/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_only_yn_loss.yml @@ -0,0 +1,305 @@ +# Load config defining problems for training, validation and testing. +default_configs: vqa_med_2019/default_vqa_med_2019.yml + +# Training parameters: +training: + problem: + categories: C1,C2,C3 + export_sample_weights: ~/data/vqa-med/answers.binary_yn.weights.csv + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + sampler: + weights: ~/data/vqa-med/answers.binary_yn.weights.csv + +# Validation parameters: +validation: + problem: + categories: C1,C2,C3 + # Appy all preprocessing/data augmentations. + question_preprocessing: lowercase,remove_punctuation,tokenize + streams: + questions: tokenized_questions + + +pipeline: + + ################# PIPE 0: SHARED ################# + + # Add global variables. + global_publisher: + type: GlobalVariablePublisher + priority: 0 + # Add input_size to globals. + keys: [question_lstm_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, category_binary_yn_word_to_ix] + values: [100, 2, 10, 100, {"BINARY": 3}] + + # Statistics. 
+ batch_size: + type: BatchSizeStatistics + priority: 0.1 + + ################# PIPE 0: CATEGORY ################# + + # Model 1: question embeddings + pipe0_question_embeddings: + type: SentenceEmbeddings + priority: 0.3 + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: question_embeddings + freeze: True + ################### + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: pipe0_embedded_questions + + # Model 2: question RNN + pipe0_lstm: + priority: 0.4 + type: RecurrentNeuralNetwork + cell_type: LSTM + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: lstm + freeze: True + ################### + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: pipe0_embedded_questions + predictions: pipe0_questions_activations + globals: + input_size: embeddings_size + prediction_size: question_lstm_output_size + + # Model 3: FFN question category + pipe0_classifier: + priority: 0.5 + type: FeedForwardNetwork + # LOAD AND FREEZE # + load: + file: ~/image-clef-2019/experiments/q_categorization/20190416_120801/checkpoints/vqa_med_question_categorization_rnn_ffn_best.pt + model: classifier + freeze: True + ################### + hidden: [50] + streams: + inputs: pipe0_questions_activations + predictions: pipe0_predicted_question_categories_preds + globals: + input_size: question_lstm_output_size # Set by global publisher + prediction_size: num_categories # C1,C2,C3,C4, BINARY, UNK + + pipe0_category_decoder: + priority: 0.6 + type: WordDecoder + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe0_predicted_question_categories_preds + outputs: pipe0_predicted_question_categories_names + globals: + vocabulary_size: num_categories + word_mappings: category_word_mappings + + pipe0_category_accuracy: + type: AccuracyStatistics + priority: 0.7 + streams: + targets: category_ids + predictions: pipe0_predicted_question_categories_preds + statistics: + accuracy: categorization_accuracy + + ################# PIPE 1: SHARED QUESTION ENCODER ################# + + # Model 1: question embeddings + pipe1_question_embeddings: + type: SentenceEmbeddings + priority: 1.1 + embeddings_size: 50 + pretrained_embeddings_file: glove.6B.50d.txt + data_folder: ~/data/vqa-med + word_mappings_file: questions.all.word.mappings.csv + streams: + inputs: tokenized_questions + outputs: embedded_questions + + # Model 2: question RNN + pipe1_lstm: + priority: 1.2 + type: RecurrentNeuralNetwork + cell_type: LSTM + prediction_mode: Last + initial_state: Trainable + use_logsoftmax: False + streams: + inputs: embedded_questions + predictions: questions_activations + globals: + input_size: embeddings_size + prediction_size: question_lstm_output_size + + # Answer encoding + pipe1_all_answer_indexer: + type: LabelIndexer + priority: 1.3 + data_folder: ~/data/vqa-med + word_mappings_file: answers.binary_yn.word.mappings.csv + # Export mappings and size to globals. 
+ export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: all_answers_ids + globals: + vocabulary_size: vocabulary_size_binary_yn + word_mappings: word_mappings_binary_yn + + ################# PIPE 2: SHARED IMAGE ENCODER ################# + + # Image encoder. + image_encoder: + type: TorchVisionWrapper + model: vgg16 + priority: 2.1 + streams: + inputs: images + outputs: image_activations + globals: + output_size: image_encoder_output_size + + ################# PIPE 3: SHARED IMAGE SIZE ENCODER ################# + + # Model - image size classifier. + image_size_encoder: + type: FeedForwardNetwork + priority: 3.1 + streams: + inputs: image_sizes + predictions: image_size_activations + globals: + input_size: image_size_encoder_input_size + prediction_size: image_size_encoder_output_size + + ################# PIPE 4: SHARED CONCAT ################# + + concat: + type: Concatenation + priority: 4.1 + input_streams: [questions_activations,image_activations,image_size_activations] + # Concatenation + dim: 1 # default + input_dims: [[-1,100],[-1,100],[-1,10]] + output_dims: [-1,210] + streams: + outputs: concatenated_activations + globals: + output_size: concatenated_activations_size + + + ################# PIPE 5: Y/N questions ################# + + # Answer encoding for PIPE 5. + pipe5_binary_yn_answer_indexer: + type: LabelIndexer + priority: 5.1 + data_folder: ~/data/vqa-med + word_mappings_file: answers.binary_yn.word.mappings.csv + # Export mappings and size to globals. + export_word_mappings_to_globals: True + streams: + inputs: answers + outputs: pipe5_binary_yn_answers_ids + globals: + vocabulary_size: vocabulary_size_binary_yn + word_mappings: word_mappings_binary_yn + + # Sample masking based on categories. + pipe5_binary_yn_string_to_mask: + priority: 5.2 + type: StringToMask + globals: + word_mappings: category_binary_yn_word_to_ix + streams: + strings: pipe0_predicted_question_categories_names + string_indices: predicted_c123_by_question_categories_indices # NOT USED + masks: pipe5_binary_yn_masks + + # Model 4: FFN C1 answering + pipe5_binary_yn_classifier: + priority: 5.3 + type: FeedForwardNetwork + hidden: [100] + dropout_rate: 0.5 + streams: + inputs: concatenated_activations + predictions: pipe5_binary_yn_predictions + globals: + input_size: concatenated_activations_size + prediction_size: vocabulary_size_binary_yn + + pipe5_binary_yn_nllloss: + type: NLLLoss + priority: 5.4 + targets_dim: 1 + use_masking: True + streams: + predictions: pipe5_binary_yn_predictions + masks: pipe5_binary_yn_masks + targets: pipe5_binary_yn_answers_ids + loss: pipe5_binary_yn_loss + + pipe5_binary_yn_precision_recall: + type: PrecisionRecallStatistics + priority: 5.5 + use_word_mappings: True + use_masking: True + show_class_scores: True + #show_confusion_matrix: True + streams: + masks: pipe5_binary_yn_masks + predictions: pipe5_binary_yn_predictions + targets: pipe5_binary_yn_answers_ids + globals: + word_mappings: word_mappings_binary_yn + statistics: + precision: pipe5_binary_yn_precision + recall: pipe5_binary_yn_recall + f1score: pipe5_binary_yn_f1score + + # Y/N Predictions decoder. + pipe5_prediction_decoder: + type: WordDecoder + priority: 5.6 + # Use the same word mappings as label indexer. + import_word_mappings_from_globals: True + streams: + inputs: pipe5_binary_yn_predictions + outputs: predicted_answers + globals: + word_mappings: word_mappings_binary_yn + + ################# PIPE 9: MERGE ANSWERS ################# + + + # Viewers. 
+ viewer: + type: StreamViewer + priority: 9.3 + input_streams: + tokenized_questions, category_names, + pipe0_predicted_question_categories_names, + pipe5_binary_yn_masks, + answers, predicted_answers + + +#: pipeline diff --git a/ptp/components/models/__init__.py b/ptp/components/models/__init__.py index 20c2841..81a3868 100644 --- a/ptp/components/models/__init__.py +++ b/ptp/components/models/__init__.py @@ -12,6 +12,7 @@ from .vqa.element_wise_multiplication import ElementWiseMultiplication from .vqa.multimodal_compact_bilinear_pooling import MultimodalCompactBilinearPooling from .vqa.relational_network import RelationalNetwork +from .vqa.attention import VQA_Attention __all__ = [ 'ConvNetEncoder', @@ -26,5 +27,6 @@ 'ElementWiseMultiplication', 'MultimodalCompactBilinearPooling', 'RelationalNetwork', - 'Attn_Decoder_RNN' + 'Attn_Decoder_RNN', + 'VQA_Attention' ] diff --git a/ptp/components/models/torch_vision_wrapper.py b/ptp/components/models/torch_vision_wrapper.py index 3419bfb..92e4cf9 100644 --- a/ptp/components/models/torch_vision_wrapper.py +++ b/ptp/components/models/torch_vision_wrapper.py @@ -119,11 +119,24 @@ def __init__(self, name, config): self.model = models.resnet50(pretrained=pretrained) if self.return_feature_maps: - raise ConfigurationError("'resnet50' doesn't support 'return_feature_maps' mode (yet)") + # Get all modules exluding last (avgpool) and (fc) + modules=list(self.model.children())[:-2] + self.model=torch.nn.Sequential(*modules) - # Use the whole model, but cut/reshape only the last layer. - self.output_size = self.globals["output_size"] - self.model.fc = torch.nn.Linear(2048, self.output_size) + # Height of the returned features tensor (SET) + self.feature_maps_height = 7 + self.globals["feature_maps_height"] = self.feature_maps_height + # Width of the returned features tensor (SET) + self.feature_maps_width = 7 + self.globals["feature_maps_width"] = self.feature_maps_width + # Depth of the returned features tensor (SET) + self.feature_maps_depth = 2048 + self.globals["feature_maps_depth"] = self.feature_maps_depth + + else: + # Use the whole model, but cut/reshape only the last layer. + self.output_size = self.globals["output_size"] + self.model.fc = torch.nn.Linear(2048, self.output_size) def input_data_definitions(self): diff --git a/ptp/components/models/vqa/attention.py b/ptp/components/models/vqa/attention.py new file mode 100644 index 0000000..15c7914 --- /dev/null +++ b/ptp/components/models/vqa/attention.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (C) IBM Corporation 2018 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Deepta Rajan" + + +import torch + +from ptp.components.models.model import Model +from ptp.data_types.data_definition import DataDefinition + + +class VQA_Attention(Model): + """ + Element of one of the classical baselines for Visual Question Answering. + Attention-weighted image maps are computed based on the question. 
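With return_feature_maps enabled, torch_vision_wrapper.py now strips the final average-pooling and fully-connected layers from ResNet-50 and returns raw feature maps; the hard-coded globals (7 x 7 x 2048) correspond to 224 x 224 inputs. A quick standalone check of that truncation:

import torch
from torchvision import models

backbone = models.resnet50(pretrained=False)
feature_extractor = torch.nn.Sequential(*list(backbone.children())[:-2])   # drop avgpool and fc

with torch.no_grad():
    maps = feature_extractor(torch.randn(1, 3, 224, 224))
print(maps.shape)   # torch.Size([1, 2048, 7, 7]) -> feature_maps_depth/height/width globals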
+ The multi-modal data (question and attention-weighted image maps) are fused via concatenation and returned (for subsequent classification, done in a separate component e.g. ffn). + + On the basis of: Vahid Kazemi Ali Elqursh. "Show, Ask, Attend, and Answer: A Strong Baseline For Visual Question Answering" (2017). + Code: https://github.com/Cyanogenoid/pytorch-vqa/blob/master/model.py + """ + def __init__(self, name, config): + """ + Initializes the model, creates the required layers. + + :param name: Name of the model (taken from the configuration file). + + :param config: Parameters read from configuration file. + :type config: ``ptp.configuration.ConfigInterface`` + + """ + super(VQA_Attention, self).__init__(name, VQA_Attention, config) + + # Get key mappings. + self.key_feature_maps = self.stream_keys["feature_maps"] + self.key_question_encodings = self.stream_keys["question_encodings"] + self.key_outputs = self.stream_keys["outputs"] + + # Retrieve input/output sizes from globals. + self.feature_maps_height = self.globals["feature_maps_height"] + self.feature_maps_width = self.globals["feature_maps_width"] + self.feature_maps_depth = self.globals["feature_maps_depth"] + self.question_encoding_size = self.globals["question_encoding_size"] + + # Get size of latent space and number of heads from config. + self.latent_size = self.config["latent_size"] + self.num_attention_heads = self.config["num_attention_heads"] + + # Output feature size + self.output_size = self.feature_maps_depth*self.num_attention_heads + self.question_encoding_size + + # Map image and question encodings to a common latent space of dimension 'latent_size'. + self.image_encodings_conv = torch.nn.Conv2d(self.feature_maps_depth, self.latent_size, 1, bias=False) + self.question_encodings_ff = torch.nn.Linear(self.question_encoding_size, self.latent_size) + + # Scalar-dot product attention function is implemented as a Conv operation + self.attention_conv = torch.nn.Conv2d(self.latent_size, self.num_attention_heads, 1) + + # Create activation layer. + self.activation = torch.nn.ReLU() + + # Retrieve dropout rate value - if set, will put dropout between every layer. + dropout_rate = self.config["dropout_rate"] + + # Create dropout layer. + self.dropout = torch.nn.Dropout(dropout_rate) + + + def input_data_definitions(self): + """ + Function returns a dictionary with definitions of input data that are required by the component. + + :return: dictionary containing input data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_feature_maps: DataDefinition([-1, self.feature_maps_depth, self.feature_maps_height, self.feature_maps_width], [torch.Tensor], "Batch of feature maps [BATCH_SIZE x FEAT_DEPTH x FEAT_HEIGHT x FEAT_WIDTH]"), + self.key_question_encodings: DataDefinition([-1, self.question_encoding_size], [torch.Tensor], "Batch of encoded questions [BATCH_SIZE x QUESTION_ENCODING_SIZE]"), + } + + + def output_data_definitions(self): + """ + Function returns a dictionary with definitions of output data produced the component. + + :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_outputs: DataDefinition([-1, self.output_size], [torch.Tensor], "Batch of outputs [BATCH_SIZE x OUTPUT_SIZE]") + } + + def forward(self, data_dict): + """ + Main forward pass of the model. + + :param data_dict: DataDict({'images',**}) + :type data_dict: ``ptp.dadatypes.DataDict`` + """ + + # Unpack DataDict. 
+ enc_img = data_dict[self.key_feature_maps] #[48, 2048, 7, 7] + enc_q = data_dict[self.key_question_encodings] #[48, 100] + # print("im_enc", enc_img.shape) + # print("enc_q", enc_q.shape) + + # L2 norm of image encoding + enc_img = enc_img / (enc_img.norm(p=2, dim=1, keepdim=True).expand_as(enc_img) + 1e-8) + + # Compute attention maps for image using questions + latent_img = self.image_encodings_conv(self.dropout(enc_img)) # [48, 100, 7, 7] + # print("latent_im", latent_img.shape) + latent_q = self.question_encodings_ff(self.dropout(enc_q)) # [48, 100] + # print("latent_q", latent_q.shape) + latent_q_tile = tile_2d_over_nd(latent_q, latent_img) # [48, 100, 7, 7] + # print("latent_q_tile", latent_q_tile.shape) + attention = self.activation(latent_img + latent_q_tile) # + # print("attention", attention.shape) + attention = self.attention_conv(self.dropout(attention)) # [48, 2, 7, 7] + # print("attention", attention.shape) + + # Apply attention to image encoding + attention_enc_img = apply_attention(enc_img, attention) # [48, 2048, 7, 7], [48, 2, 7, 7] + # print("attention im", attention_enc_img.shape) + + # Fusion -- Concatenate attention-weighted image encodings and question encodings. + outputs = torch.cat([attention_enc_img, latent_q], dim=1) + # print("outputs", outputs.shape) + # Add predictions to datadict. + data_dict.extend({self.key_outputs: outputs}) + + +def tile_2d_over_nd(feature_vector, feature_map): + """ Repeat the same feature vector over all spatial positions of a given feature map. + The feature vector should have the same batch size and number of features as the feature map. + """ + n, c = feature_vector.size() + spatial_size = feature_map.dim() - 2 + tiled = feature_vector.view(n, c, *([1] * spatial_size)).expand_as(feature_map) + return tiled + + +def apply_attention(input, attention): + """ Apply any number of attention maps over the input. """ + n, c = input.size()[:2] + glimpses = attention.size(1) # glimpses is equivalent to multiple heads in attention + + # flatten the spatial dims into the third dim, since we don't need to care about how they are arranged + input = input.view(n, 1, c, -1) # [n, 1, c, s] [batch, 1, channels, height*width] [48, 1, 2048, 7*7] + attention = attention.view(n, glimpses, -1) # [48, 2, 7*7] + attention = torch.nn.functional.softmax(attention, dim=-1).unsqueeze(2) # [n, g, 1, s] [batch, multi_head, 1, height*width] [48, 2, 1, 7*7] + weighted = attention * input # [n, g, c, s] [48, 2, 2048, 7*7] + weighted_mean = weighted.sum(dim=-1) # [n, g, c] [48, 2, 2048] + return weighted_mean.view(n, -1) # [48, 4196] diff --git a/ptp/components/publishers/bleu_statistics.py b/ptp/components/publishers/bleu_statistics.py index b303ea9..6432c06 100644 --- a/ptp/components/publishers/bleu_statistics.py +++ b/ptp/components/publishers/bleu_statistics.py @@ -58,6 +58,9 @@ def __init__(self, name, config): # Get masking flag. #self.use_masking = self.config["use_masking"] + # Get ignored words + self.ignored_words = self.config["ignored_words"] + # Retrieve word mappings from globals. word_to_ix = self.globals["word_mappings"] # Construct reverse mapping for faster processing. @@ -144,12 +147,16 @@ def calculate_BLEU(self, data_dict): target_words = [] for t_ind in target_indices: if t_ind in self.ix_to_word.keys(): - target_words.append(self.ix_to_word[t_ind]) + w = self.ix_to_word[t_ind] + if w not in self.ignored_words: + target_words.append(w) # Change prediction indices to words. 
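The forward pass above projects both modalities into a 100-d latent space, tiles the projected question over the 7 x 7 grid, derives two attention maps, softmaxes them over the 49 spatial positions, and takes per-head weighted sums of the feature maps before concatenating with the projected question. A shape-level sketch with plain tensor ops, omitting the L2 normalisation and dropout of the actual code:

import torch

n, depth, h, w = 48, 2048, 7, 7
heads, latent, q_size = 2, 100, 100

feature_maps = torch.randn(n, depth, h, w)
question_enc = torch.randn(n, q_size)

latent_img = torch.nn.Conv2d(depth, latent, 1, bias=False)(feature_maps)          # [48, 100, 7, 7]
latent_q = torch.nn.Linear(q_size, latent)(question_enc)                          # [48, 100]
tiled_q = latent_q.view(n, latent, 1, 1).expand_as(latent_img)                    # tile_2d_over_nd
attention = torch.nn.Conv2d(latent, heads, 1)(torch.relu(latent_img + tiled_q))   # [48, 2, 7, 7]

# apply_attention: softmax over the 49 spatial positions, then a weighted sum per head (glimpse).
attn = torch.softmax(attention.view(n, heads, -1), dim=-1).unsqueeze(2)           # [48, 2, 1, 49]
weighted = (attn * feature_maps.view(n, 1, depth, -1)).sum(dim=-1)                # [48, 2, 2048]
glimpses = weighted.view(n, -1)                                                   # [48, 4096]

outputs = torch.cat([glimpses, latent_q], dim=1)                                  # [48, 4196] = 2*2048 + 100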
pred_words = [] for p_ind in pred_indices: if p_ind in self.ix_to_word.keys(): - pred_words.append(self.ix_to_word[p_ind]) + w = self.ix_to_word[p_ind] + if w not in self.ignored_words: + pred_words.append(w) # Calculate BLEU. scores.append(sentence_bleu([target_words], pred_words, self.weights)) #print("TARGET: {}\n".format(target_words)) diff --git a/ptp/components/publishers/stream_file_exporter.py b/ptp/components/publishers/stream_file_exporter.py index ec34ea5..64cf46b 100644 --- a/ptp/components/publishers/stream_file_exporter.py +++ b/ptp/components/publishers/stream_file_exporter.py @@ -55,6 +55,11 @@ def __init__(self, name, config): filename = self.config["filename"] abs_filename = path.join(self.app_state.log_dir, filename) self.file = open(abs_filename, 'w') + + # Export additional line. + if self.config["export_separator_line_to_csv"]: + self.file.write("sep={}\n".format(self.separator)) + self.logger.info("Writing values from {} streams to {}".format(self.input_stream_keys, abs_filename)) diff --git a/ptp/workers/processor.py b/ptp/workers/processor.py index bfc29dc..b5afa68 100644 --- a/ptp/workers/processor.py +++ b/ptp/workers/processor.py @@ -17,7 +17,7 @@ __author__ = "Tomasz Kornuta, Vincent Marois, Younes Bouhadjar" -import os +from os import path,makedirs import torch from time import sleep from datetime import datetime @@ -93,12 +93,12 @@ def setup_global_experiment(self): exit(-2) # Check if file with model exists. - if not os.path.isfile(chkpt_file): + if not path.isfile(chkpt_file): print('Checkpoint file {} does not exist'.format(chkpt_file)) exit(-3) # Extract path. - self.abs_path, _ = os.path.split(os.path.dirname(os.path.expanduser(chkpt_file))) + self.abs_path, _ = path.split(path.dirname(path.expanduser(chkpt_file))) print(self.abs_path) # Check if config file was indicated by the user. @@ -106,10 +106,10 @@ def setup_global_experiment(self): # Split and make them absolute. root_configs = self.app_state.args.config.replace(" ", "").split(',') # If there are - expand them to absolute paths. - abs_root_configs = [os.path.expanduser(config) for config in root_configs] + abs_root_configs = [path.expanduser(config) for config in root_configs] else: # Use the "default one". - abs_root_configs = [os.path.join(self.abs_path, 'training_configuration.yml')] + abs_root_configs = [path.join(self.abs_path, 'training_configuration.yml')] # Get the list of configurations which need to be loaded. configs_to_load = config_parsing.recurrent_config_parse(abs_root_configs, [], self.app_state.absolute_config_path) @@ -160,7 +160,7 @@ def setup_individual_experiment(self): self.app_state.log_dir = self.abs_path + '/' + time_str + '/' # Lowercase dir. self.app_state.log_dir = self.app_state.log_dir.lower() - os.makedirs(self.app_state.log_dir, exist_ok=False) + makedirs(self.app_state.log_dir, exist_ok=False) except FileExistsError: sleep(1) else: @@ -254,7 +254,7 @@ def setup_individual_experiment(self): pipeline_name = "" # Try to load the model. if pipeline_name != "": - if os.path.isfile(pipeline_name): + if path.isfile(pipeline_name): # Load parameters from checkpoint. 
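The new ignored_words option makes BleuStatistics drop special tokens from both the target and the predicted word lists before scoring, so padding or end-of-sequence markers no longer deflate the score. A small NLTK example; the token strings here are placeholders, the config lists the actual ones.

from nltk.translate.bleu_score import sentence_bleu

ignored_words = ["<PAD>", "<EOS>"]        # placeholders for the configured special tokens
weights = (0.25, 0.25, 0.25, 0.25)

target_words = [w for w in ["the", "scan", "is", "normal", "<EOS>", "<PAD>"] if w not in ignored_words]
pred_words = [w for w in ["the", "scan", "is", "normal", "<EOS>"] if w not in ignored_words]

score = sentence_bleu([target_words], pred_words, weights)
print(score)    # 1.0 once the special tokens are ignored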
self.pipeline.load(pipeline_name) else: diff --git a/ptp/workers/trainer.py b/ptp/workers/trainer.py index c3a583c..1182574 100644 --- a/ptp/workers/trainer.py +++ b/ptp/workers/trainer.py @@ -17,7 +17,7 @@ __author__ = "Vincent Marois, Tomasz Kornuta" -import os +from os import path,makedirs import yaml import torch from time import sleep @@ -84,38 +84,25 @@ def setup_experiment(self): - Calls base class setup_experiment to parse the command line arguments, - - Loads the config file(s): + - Loads the config file(s) - >>> configs_to_load = self.recurrent_config_parse(flags.config, []) + - Set up the log directory path - - Set up the log directory path: + - Add a ``FileHandler`` to the logger - >>> os.makedirs(self.app_state.log_dir, exist_ok=False) - - - Add a ``FileHandler`` to the logger: - - >>> self.add_file_handler_to_logger(self.log_file) - - - Set random seeds: - - >>> self.set_random_seeds(self.config['training'], 'training') + - Set random seeds - Creates the pipeline consisting of many components - Creates training problem manager - - Handles curriculum learning if indicated: + - Handles curriculum learning if indicated - >>> if 'curriculum_learning' in self.config['training']: - >>> ... + - Creates validation problem manager - - Creates training problem manager - - - Set optimizer: + - Set optimizer - >>> self.optimizer = getattr(torch.optim, optimizer_name) - - - Performs testing of compatibility of both training and validation pipelines. + - Performs testing of compatibility of both training and validation problems and created pipeline. """ # Call base method to parse all command line arguments and add default sections. @@ -134,7 +121,7 @@ def setup_experiment(self): # Split and make them absolute. root_configs = self.app_state.args.config.replace(" ", "").split(',') # If there are - expand them to absolute paths. - abs_root_configs = [os.path.expanduser(config) for config in root_configs] + abs_root_configs = [path.expanduser(config) for config in root_configs] # Get the list of configurations which need to be loaded. configs_to_load = config_parse.recurrent_config_parse(abs_root_configs, [], self.app_state.absolute_config_path) @@ -168,8 +155,12 @@ def setup_experiment(self): try: pipeline_name = self.config['pipeline']['name'] except KeyError: - print("Error: Couldn't retrieve the pipeline 'name' from the loaded configuration") - exit(-1) + # Using name of the first configuration file from command line. + basename = path.basename(root_configs[0]) + # Take config filename without extension. + pipeline_name = path.splitext(basename)[0] + # Set pipeline name, so processor can use it afterwards. + self.config['pipeline'].add_config_params({'name': pipeline_name}) # Prepare the output path for logging while True: # Dirty fix: if log_dir already exists, wait for 1 second and try again @@ -177,10 +168,10 @@ def setup_experiment(self): time_str = '{0:%Y%m%d_%H%M%S}'.format(datetime.now()) if self.app_state.args.savetag != '': time_str = time_str + "_" + self.app_state.args.savetag - self.app_state.log_dir = os.path.expanduser(self.app_state.args.expdir) + '/' + training_problem_type + '/' + pipeline_name + '/' + time_str + '/' + self.app_state.log_dir = path.expanduser(self.app_state.args.expdir) + '/' + training_problem_type + '/' + pipeline_name + '/' + time_str + '/' # Lowercase dir. 
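The trainer now falls back to naming the pipeline after the first configuration file passed on the command line whenever the loaded config contains no pipeline 'name'. The same path logic in isolation:

from os import path

root_configs = ["~/configs/vqa_med_2019/vf/lstm_vgg16_is_cat_ffn_c123_no_yn_loss.yml"]
basename = path.basename(root_configs[0])
pipeline_name = path.splitext(basename)[0]      # config filename without extension
print(pipeline_name)   # lstm_vgg16_is_cat_ffn_c123_no_yn_loss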
self.app_state.log_dir = self.app_state.log_dir.lower() - os.makedirs(self.app_state.log_dir, exist_ok=False) + makedirs(self.app_state.log_dir, exist_ok=False) except FileExistsError: sleep(1) else: @@ -199,7 +190,7 @@ def setup_experiment(self): # Models dir. self.checkpoint_dir = self.app_state.log_dir + 'checkpoints/' - os.makedirs(self.checkpoint_dir, exist_ok=False) + makedirs(self.checkpoint_dir, exist_ok=False) # Set random seeds in the training section. self.set_random_seeds('training', self.config['training']) @@ -283,7 +274,7 @@ def setup_experiment(self): pipeline_name = "" # Try to load the model. if pipeline_name != "": - if os.path.isfile(pipeline_name): + if path.isfile(pipeline_name): # Load parameters from checkpoint. self.pipeline.load(pipeline_name) else:
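Both trainer.py and processor.py keep the same experiment-directory logic after switching from "import os" to "from os import path, makedirs": build a timestamped path, lowercase it, and retry after a second if a concurrent run already created it. A condensed sketch with illustrative values for the directory components:

from os import path, makedirs
from time import sleep
from datetime import datetime

expdir, problem, pipeline_name = "~/experiments", "vqamed2019", "lstm_vgg16_is_cat_ffn_c123_no_yn_loss"

while True:
    time_str = '{0:%Y%m%d_%H%M%S}'.format(datetime.now())
    log_dir = path.expanduser(expdir) + '/' + problem + '/' + pipeline_name + '/' + time_str + '/'
    try:
        # exist_ok=False makes a name collision raise, which triggers the retry.
        makedirs(log_dir.lower(), exist_ok=False)
    except FileExistsError:
        sleep(1)
    else:
        break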