diff --git a/configs/default/workers/offline_trainer.yml b/configs/default/workers/offline_trainer.yml new file mode 100644 index 0000000..58d52ff --- /dev/null +++ b/configs/default/workers/offline_trainer.yml @@ -0,0 +1,117 @@ +#################################################################### +# Section defining all the default values of parameters used during training when using ptp-offline-trainer. + # If you want to use a different section for training, pass its name as the command line argument '--training_section_name' to the trainer (DEFAULT: training) +# Note: in such a case remember to define all the required parameters in the new section. +training: + # Set the random seeds: -1 means that they will be picked randomly. + # Note: their final values will be stored in the final training_configuration.yml saved to log dir. + seed_numpy: -1 + seed_torch: -1 + + # Default batch size. + batch_size: 64 + + # Definition of the problem (Mandatory!) + #problem: + # One must define its type (Mandatory!) + # type: ? + # The rest of the content of that section is problem-specific... + + # Section describing curriculum learning (Optional) + #curriculum_learning: + # # Flag indicating whether curriculum learning has to finish before (eventual) termination of the training. + # must_finish: True + # The rest of the content of that section is problem-specific... + + # Definition of optimizer (Mandatory!) + #optimizer: + # # Type - generally all optimizers from PyTorch.optim are allowed (Mandatory!) + # type: Adam + # # Options: + # lr: 0.0001 + # The rest of the content of that section is optimizer-specific... + + # Set a default configuration section for data loader. + dataloader: + # Shuffle set by default. + shuffle: True + batch_sampler: None + # Do not use multiprocessing by default. + num_workers: 0 + pin_memory: False + # Do not drop the last (incomplete) batch by default. + drop_last: False + timeout: 0 + + # Definition of sampler (Optional) + # When this section is not present, the worker will use "standard" sampling (please refer to shuffle in dataloader) + #sampler: + # # Type - generally all samplers from PyTorch (plus some new ones) are allowed (Mandatory!) + # # Options: + # type: RandomSampler + # The rest of the content of that section is sampler-specific... + + # Terminal conditions that will be used during training. + # They can (and often should) be overwritten. + terminal_conditions: + # Terminal condition I: loss threshold, going below will terminate the training. + loss_stop_threshold: 0.00001 # 1e-5 + # Terminal condition II: Early stopping monitors the validation loss; if it didn't go down during the last n validations, training will be terminated (Optional, a negative value means that this condition is disabled) + early_stop_validations: 10 + # Terminal condition III: maximal number of epochs (Mandatory for this trainer! Must be > 0) + epoch_limit: 10 + # Terminal condition IV: maximal number of episodes (Optional, -1 (negative) means that this condition is disabled) + episode_limit: -1 + + + +#################################################################### +# Section defining all the default values of parameters used during validation. +# If you want to use a different section for validation, pass its name as the command line argument '--validation_section_name' to the trainer (DEFAULT: validation) +# Note: in such a case remember to define all the required parameters in the new section. +validation: + # Defines how often the partial validation will be performed.
+ # In this trainer Partial Validation is optional (negative value means it is disabled) + partial_validation_interval: -1 + + # Definition of the problem (mandatory!) + #problem: + # One must define its type (Mandatory!) + # type: ? + # The rest of the content of that section is problem-specific... + + # Set a default configuration section for data loader. + dataloader: + # Shuffle set by default. + shuffle: True + # Do not use multiprocessing by default. + num_workers: 0 + pin_memory: False + # Do not drop last frame by default. + drop_last: False + timeout: 0 + + # Definition of sampler (Optional) + # When this section will not be present, worker will use "standard" sampling (please refer to shuffle in dataloader) + #sampler: + # # Type - generally all samplers from PyTorch (plus some new onses) are allowed (Mandatory!) + # # Options: + # type: RandomSmpler + # The rest of the content of that section is optimizer-specific... + + + +#################################################################### +# Section defining all the default values of parameters used during training. +# If you want to use different section for validation pass its name as command line argument '--pipeline_section_name' to trainer (DEFAULT: pipeline) +pipeline: + # Pipeline must contain at least one component. + #name_1: + # Each component must have defined its priority... (Mandatory!) + # priority: 0.1 # Can be float. Smaller means higher priority, up to zero. + # # ... and type (Mandatory!) + # type: ? + # The rest of the content of that section is component-specific... + + diff --git a/configs/default/workers/online_trainer.yml b/configs/default/workers/online_trainer.yml index 96fe8ed..a8783fe 100644 --- a/configs/default/workers/online_trainer.yml +++ b/configs/default/workers/online_trainer.yml @@ -1,5 +1,6 @@ #################################################################### -# Section defining all the default values of parameters used during training. +# Section defining all the default values of parameters used during training when using ptp-online-trainer. + # If you want to use different section for training pass its name as command line argument '--training_section_name' to trainer (DEFAULT: training) # Note: in such a case remember to define all the required parameters in the new section. training: @@ -55,10 +56,12 @@ training: # They can (and ofter should) be overwritten. terminal_conditions: # Terminal condition I: loss threshold, going below will terminate the training. - loss_stop: 0.00001 # 1e-5 - # Terminal condition II: maximal number of epochs (optional, -1 means that this condition is disabled) + loss_stop_threshold: 0.00001 # 1e-5 + # Terminal condition II: Early stopping monitor validation loss, if it didn't down during last n validations, training will be terminated (Optional, negative means that this condition is disabled) + early_stop_validations: 10 + # Terminal condition III: maximal number of epochs (Optional, -1 (negative) means that this condition is disabled) epoch_limit: -1 - # Terminal condition III: maximal number of episodes (Mandatory for this trainer! Must be > 0) + # Terminal condition IV: maximal number of episodes (Mandatory for this trainer! 
Must be > 0) episode_limit: 100000 diff --git a/configs/default/workers/processor.yml b/configs/default/workers/processor.yml index e04a388..7737233 100644 --- a/configs/default/workers/processor.yml +++ b/configs/default/workers/processor.yml @@ -34,7 +34,7 @@ test: #sampler: # # Type - generally all samplers from PyTorch (plus some new onses) are allowed (Mandatory!) # # Options: - # type: RandomSmpler + # type: RandomSampler # The rest of the content of that section is optimizer-specific... diff --git a/configs/mnist/default_mnist.yml b/configs/mnist/default_mnist.yml index 5565041..d8c849f 100644 --- a/configs/mnist/default_mnist.yml +++ b/configs/mnist/default_mnist.yml @@ -2,35 +2,36 @@ training: problem: type: MNIST - batch_size: &b 6400 + batch_size: &b 64 use_train_data: True #resize: [32, 32] # Use sampler that operates on a subset. - #sampler: - # type: SubsetRandomSampler - # indices: [0, 55000] + sampler: + type: SubsetRandomSampler + indices: [0, 55000] # optimizer parameters: optimizer: type: Adam lr: 0.0001 # settings parameters terminal_conditions: - loss_stop: 0.05 + loss_stop_threshold: 0.05 + early_stop_validations: 10 episode_limit: 10000 epoch_limit: 10 # Validation parameters: validation: - partial_validation_interval: 500 + #partial_validation_interval: 100 problem: type: MNIST batch_size: *b use_train_data: True # True because we are splitting the training set to: validation and training #resize: [32, 32] # Use sampler that operates on a subset. - #sampler: - # type: SubsetRandomSampler - # indices: [55000, 60000] + sampler: + type: SubsetRandomSampler + indices: [55000, 60000] # Testing parameters: test: diff --git a/configs/mnist/mnist_classification_kfold_softmax.yml b/configs/mnist/mnist_classification_kfold_softmax.yml index cfa0402..9685e82 100644 --- a/configs/mnist/mnist_classification_kfold_softmax.yml +++ b/configs/mnist/mnist_classification_kfold_softmax.yml @@ -7,18 +7,17 @@ training: type: MNIST batch_size: &b 64 use_train_data: True - #resize: [32, 32] # Use k-fold cross-validation random sampler. sampler: type: kFoldRandomSampler - folds: 7 # Each with size of 6000 + folds: 10 # Each with size of 6000 # optimizer parameters: optimizer: type: Adam lr: 0.0001 # settings parameters terminal_conditions: - loss_stop: 0.5 + loss_stop_threshold: 0.5 episode_limit: 10000 epoch_limit: 5 @@ -29,11 +28,10 @@ validation: type: MNIST batch_size: *b use_train_data: True # True because we are splitting the training set to: validation and training - #resize: [32, 32] # Use k-fold cross-validation random sampler. sampler: type: kFoldRandomSampler - folds: 7 # Each with size of 6000 + folds: 10 # Each with size of 6000 pipeline: diff --git a/configs/mnist/mnist_classification_vf_2lenet5.yml b/configs/mnist/mnist_classification_vf_2lenet5_2losses.yml similarity index 88% rename from configs/mnist/mnist_classification_vf_2lenet5.yml rename to configs/mnist/mnist_classification_vf_2lenet5_2losses.yml index 6737421..305df53 100644 --- a/configs/mnist/mnist_classification_vf_2lenet5.yml +++ b/configs/mnist/mnist_classification_vf_2lenet5_2losses.yml @@ -20,7 +20,7 @@ test: pipeline: # Disable components for "default" flow. - disable: nllloss, precision_recall + disable: nllloss, accuracy, precision_recall # Add global variables. 
global_publisher: @@ -30,30 +30,39 @@ pipeline: values: [3, 7, {"Zero": 0, "One": 1, "Two": 2}, {"Three": 0, "Four": 1, "Five": 2, "Six": 3, "Seven": 4, "Eight": 5, "Nine": 6}] ################# Flow 1 ################# - flow1_string_to_mask: - type: StringToMask - priority: 1.1 - globals: - word_mappings: word_to_ix1 - streams: - strings: labels - string_indices: flow1_targets - masks: flow1_masks - # Image classifier. flow1_image_classifier: type: LeNet5 - priority: 1.2 + priority: 1.1 globals: prediction_size: num_classes1 streams: inputs: inputs predictions: flow1_predictions + flow1_label_to_mask1: + type: StringToMask + priority: 1.2 + globals: + word_mappings: word_to_ix1 + streams: + strings: labels + masks: flow1_masks + + flow1_label_to_target1: + type: LabelIndexer + priority: 1.3 + import_word_mappings_from_globals: True + globals: + word_mappings: word_to_ix1 + streams: + inputs: labels + outputs: flow1_targets + # Masked loss. flow1_nllloss: type: NLLLoss - priority: 1.31 + priority: 1.4 use_masking: True streams: targets: flow1_targets @@ -64,7 +73,7 @@ pipeline: # Statistics. flow1_accuracy: type: AccuracyStatistics - priority: 1.32 + priority: 1.51 use_masking: True streams: predictions: flow1_predictions @@ -75,7 +84,7 @@ pipeline: flow1_precision_recall: type: PrecisionRecallStatistics - priority: 1.33 + priority: 1.52 use_word_mappings: True show_class_scores: True show_confusion_matrix: True @@ -93,30 +102,39 @@ pipeline: f1score: flow1_f1score ################# Flow 2 ################# - flow2_string_to_mask: - type: StringToMask - priority: 2.1 - globals: - word_mappings: word_to_ix2 - streams: - strings: labels - string_indices: flow2_targets - masks: flow2_masks - # Image classifier. flow2_image_classifier: type: LeNet5 - priority: 2.2 + priority: 2.1 globals: prediction_size: num_classes2 streams: inputs: inputs predictions: flow2_predictions + flow2_label_to_mask2: + type: StringToMask + priority: 2.2 + globals: + word_mappings: word_to_ix2 + streams: + strings: labels + masks: flow2_masks + + flow2_label_to_target2: + type: LabelIndexer + priority: 2.3 + import_word_mappings_from_globals: True + globals: + word_mappings: word_to_ix2 + streams: + inputs: labels + outputs: flow2_targets + # Masked loss. flow2_nllloss: type: NLLLoss - priority: 2.31 + priority: 2.4 use_masking: True streams: targets: flow2_targets @@ -127,7 +145,7 @@ pipeline: # Statistics. flow2_accuracy: type: AccuracyStatistics - priority: 2.32 + priority: 2.41 use_masking: True streams: targets: flow2_targets @@ -138,7 +156,7 @@ pipeline: flow2_precision_recall: type: PrecisionRecallStatistics - priority: 2.33 + priority: 2.42 use_word_mappings: True show_class_scores: True show_confusion_matrix: True diff --git a/configs/mnist/mnist_classification_vf_lenet5.yml b/configs/mnist/mnist_classification_vf_lenet5.yml deleted file mode 100644 index 731f180..0000000 --- a/configs/mnist/mnist_classification_vf_lenet5.yml +++ /dev/null @@ -1,125 +0,0 @@ -# Load config defining MNIST problems for training, validation and testing. -default_configs: mnist/default_mnist.yml - -# Training parameters - overwrite defaults: -training: - problem: - resize_image: [32, 32] - -# Validation parameters - overwrite defaults: -validation: - problem: - resize_image: [32, 32] - -# Testing parameters - overwrite defaults: -test: - problem: - resize_image: [32, 32] - -# Definition of the pipeline. -pipeline: - - # Disable components for "default" flow. 
- disable: nllloss, precision_recall - - ################# SHARED ################# - # Add global variables. - global_publisher: - type: GlobalVariablePublisher - priority: 0.1 - keys: [word_to_ix1, word_to_ix2] - values: [{"Zero": 0, "One": 1, "Two": 2, "Three": 3}, {"Four": 4, "Five": 5, "Six": 6, "Seven": 7, "Eight": 8, "Nine": 9}] - - # Image classifier. - image_classifier: - type: LeNet5 - priority: 1.2 - globals: - prediction_size: num_classes - streams: - inputs: inputs - predictions: predictions - - all_precision_recall: - type: PrecisionRecallStatistics - priority: 100.1 - use_word_mappings: True - show_class_scores: True - streams: - predictions: predictions - globals: - word_mappings: label_word_mappings - statistics: - precision: all_precision - recall: all_recall - f1score: all_f1score - - ################# Flow 1 ################# - flow1_string_to_mask: - type: StringToMask - priority: 2.1 - globals: - word_mappings: word_to_ix1 - streams: - strings: labels - string_indices: flow1_targets - masks: flow1_masks - - # Masked loss. - nllloss_flow1: - type: NLLLoss - priority: 10.1 - use_masking: True - streams: - predictions: predictions - targets: flow1_targets - masks: flow1_masks - - # Statistics. - flow1_precision_recall: - type: PrecisionRecallStatistics - priority: 100.3 - use_masking: True - use_word_mappings: True - show_class_scores: True - streams: - predictions: predictions - targets: flow1_targets - masks: flow1_masks - globals: - word_mappings: word_to_ix1 - statistics: - precision: flow1_precision - recall: flow1_recall - f1score: flow1_f1score - - ################# Flow 2 ################# - flow2_string_to_mask: - type: StringToMask - priority: 2.2 - globals: - word_mappings: word_to_ix2 - streams: - strings: labels - string_indices: flow2_targets - masks: flow2_masks - - flow2_precision_recall: - type: PrecisionRecallStatistics - priority: 100.5 - use_masking: True - use_word_mappings: True - show_class_scores: True - streams: - predictions: predictions - targets: flow2_targets - masks: flow2_masks - globals: - word_mappings: word_to_ix2 - statistics: - precision: flow2_precision - recall: flow2_recall - f1score: flow2_f1score - - -#: pipeline diff --git a/configs/mnist/mnist_classification_vf_lenet5_2losses.yml b/configs/mnist/mnist_classification_vf_lenet5_2losses.yml deleted file mode 100644 index a685120..0000000 --- a/configs/mnist/mnist_classification_vf_lenet5_2losses.yml +++ /dev/null @@ -1,133 +0,0 @@ -# Load config defining MNIST problems for training, validation and testing. -default_configs: mnist/default_mnist.yml - -# Training parameters - overwrite defaults: -training: - problem: - resize_image: [32, 32] - -# Validation parameters - overwrite defaults: -validation: - problem: - resize_image: [32, 32] - -# Testing parameters - overwrite defaults: -test: - problem: - resize_image: [32, 32] - -# Definition of the pipeline. -pipeline: - - # Disable components for "default" flow. - disable: nllloss, precision_recall - - ################# SHARED ################# - - # Add global variables. - global_publisher: - type: GlobalVariablePublisher - priority: 0.1 - keys: [word_to_ix1, word_to_ix2] - values: [{"Zero": 0, "One": 1, "Two": 2, "Three": 3}, {"Four": 4, "Five": 5, "Six": 6, "Seven": 7, "Eight": 8, "Nine": 9}] - - # Image classifier. 
- image_classifier: - type: LeNet5 - priority: 1.2 - globals: - prediction_size: num_classes - streams: - inputs: inputs - predictions: predictions - - all_precision_recall: - type: PrecisionRecallStatistics - priority: 100.1 - use_word_mappings: True - show_class_scores: True - streams: - predictions: predictions - globals: - word_mappings: label_word_mappings - statistics: - precision: all_precision - recall: all_recall - f1score: all_f1score - - ################# Flow 1 ################# - flow1_string_to_mask: - type: StringToMask - priority: 2.1 - globals: - word_mappings: word_to_ix1 - streams: - strings: labels - string_indices: flow1_targets - masks: flow1_masks - - # Masked loss. - flow1_nllloss: - type: NLLLoss - priority: 10.1 - use_masking: True - streams: - predictions: predictions - masks: flow1_masks - loss: flow1_loss - - # Statistics. - flow1_precision_recall: - type: PrecisionRecallStatistics - priority: 100.3 - use_masking: True - use_word_mappings: True - show_class_scores: True - streams: - predictions: predictions - masks: flow1_masks - globals: - word_mappings: word_to_ix1 - statistics: - precision: flow1_precision - recall: flow1_recall - f1score: flow1_f1score - - ################# Flow 2 ################# - flow2_string_to_mask: - type: StringToMask - priority: 2.2 - globals: - word_mappings: word_to_ix2 - streams: - strings: labels - string_indices: flow2_targets - masks: flow2_masks - - # Masked loss. - flow2_nllloss: - type: NLLLoss - priority: 10.2 - use_masking: True - streams: - predictions: predictions - masks: flow2_masks - loss: flow2_loss - - flow2_precision_recall: - type: PrecisionRecallStatistics - priority: 100.5 - use_masking: True - use_word_mappings: True - show_class_scores: True - streams: - predictions: predictions - masks: flow2_masks - globals: - word_mappings: word_to_ix2 - statistics: - precision: flow2_precision - recall: flow2_recall - f1score: flow2_f1score - -#: pipeline diff --git a/configs/mnist/mnist_classification_vf_shared_convnet_2softmaxes.yml b/configs/mnist/mnist_classification_vf_shared_convnet_2softmaxes_2losses.yml similarity index 82% rename from configs/mnist/mnist_classification_vf_shared_convnet_2softmaxes.yml rename to configs/mnist/mnist_classification_vf_shared_convnet_2softmaxes_2losses.yml index c782099..e934f64 100644 --- a/configs/mnist/mnist_classification_vf_shared_convnet_2softmaxes.yml +++ b/configs/mnist/mnist_classification_vf_shared_convnet_2softmaxes_2losses.yml @@ -10,7 +10,7 @@ training: # #type: Adam # lr: 0.001 #terminal_conditions: - # loss_stop: 0.08 + # loss_stop_threshold: 0.08 # Validation parameters - overwrite defaults: #validation: @@ -27,7 +27,7 @@ training: pipeline: # Disable components for "default" flow. - disable: nllloss, precision_recall + disable: nllloss, accuracy, precision_recall ################# SHARED ################# @@ -57,33 +57,41 @@ pipeline: output_size: reshaped_maps_size ################# Flow 1 ################# - flow1_string_to_mask: - type: StringToMask - priority: 1.1 - globals: - word_mappings: word_to_ix1 - streams: - strings: labels - string_indices: flow1_targets - masks: flow1_masks - # Classifier. 
flow1_classifier: type: FeedForwardNetwork - priority: 1.2 + priority: 1.1 streams: globals: input_size: reshaped_maps_size prediction_size: num_classes1 streams: inputs: reshaped_maps - targets: flow1_targets predictions: flow1_predictions + flow1_label_to_mask1: + type: StringToMask + priority: 1.2 + globals: + word_mappings: word_to_ix1 + streams: + strings: labels + masks: flow1_masks + + flow1_label_to_target1: + type: LabelIndexer + priority: 1.3 + import_word_mappings_from_globals: True + globals: + word_mappings: word_to_ix1 + streams: + inputs: labels + outputs: flow1_targets + # Masked loss. flow1_nllloss: type: NLLLoss - priority: 1.31 + priority: 1.4 use_masking: True streams: targets: flow1_targets @@ -92,9 +100,20 @@ pipeline: loss: flow1_loss # Statistics. + flow1_accuracy: + type: AccuracyStatistics + priority: 1.51 + use_masking: True + streams: + predictions: flow1_predictions + targets: flow1_targets + masks: flow1_masks + statistics: + accuracy: flow1_accuracy + flow1_precision_recall: type: PrecisionRecallStatistics - priority: 1.33 + priority: 1.52 use_word_mappings: True show_class_scores: True show_confusion_matrix: True @@ -112,33 +131,41 @@ pipeline: f1score: flow1_f1score ################# Flow 2 ################# - flow2_string_to_mask: - type: StringToMask - priority: 2.1 - globals: - word_mappings: word_to_ix2 - streams: - strings: labels - string_indices: flow2_targets - masks: flow2_masks - # Classifier. flow2_classifier: type: FeedForwardNetwork - priority: 2.2 + priority: 2.1 streams: globals: input_size: reshaped_maps_size prediction_size: num_classes2 streams: inputs: reshaped_maps - targets: flow2_targets predictions: flow2_predictions + flow2_label_to_mask2: + type: StringToMask + priority: 2.2 + globals: + word_mappings: word_to_ix2 + streams: + strings: labels + masks: flow2_masks + + flow2_label_to_target2: + type: LabelIndexer + priority: 2.3 + import_word_mappings_from_globals: True + globals: + word_mappings: word_to_ix2 + streams: + inputs: labels + outputs: flow2_targets + # Masked loss. flow2_nllloss: type: NLLLoss - priority: 2.31 + priority: 2.4 use_masking: True streams: targets: flow2_targets @@ -147,9 +174,20 @@ pipeline: loss: flow2_loss # Statistics. + flow2_accuracy: + type: AccuracyStatistics + priority: 2.41 + use_masking: True + streams: + targets: flow2_targets + predictions: flow2_predictions + masks: flow2_masks + statistics: + accuracy: flow2_accuracy + flow2_precision_recall: type: PrecisionRecallStatistics - priority: 2.33 + priority: 2.42 use_word_mappings: True show_class_scores: True show_confusion_matrix: True diff --git a/configs/translation/eng_fra_translation_enc_attndec.yml b/configs/translation/eng_fra_translation_enc_attndec.yml index bba5363..a299011 100644 --- a/configs/translation/eng_fra_translation_enc_attndec.yml +++ b/configs/translation/eng_fra_translation_enc_attndec.yml @@ -19,7 +19,7 @@ training: # settings parameters terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 1000000 epoch_limit: 100 diff --git a/configs/vqa_med_2019/c4_classification/c4_enc_attndec.yml b/configs/vqa_med_2019/c4_classification/c4_enc_attndec.yml index 8f106c0..022a16a 100644 --- a/configs/vqa_med_2019/c4_classification/c4_enc_attndec.yml +++ b/configs/vqa_med_2019/c4_classification/c4_enc_attndec.yml @@ -15,7 +15,7 @@ training: num_workers: 2 # Termination. 
terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 1000000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml b/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml index fa48129..92c722c 100644 --- a/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml +++ b/configs/vqa_med_2019/c4_classification/c4_enc_attndec_resnet152_ewm_cat_is.yml @@ -15,7 +15,7 @@ training: num_workers: 4 # Termination. terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 1000000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/c4_classification/c4_frozen_if_gru_dec.yml b/configs/vqa_med_2019/c4_classification/c4_frozen_if_gru_dec.yml index 1b7dd1a..e9ff65a 100644 --- a/configs/vqa_med_2019/c4_classification/c4_frozen_if_gru_dec.yml +++ b/configs/vqa_med_2019/c4_classification/c4_frozen_if_gru_dec.yml @@ -45,7 +45,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/default_vqa_med_2019.yml b/configs/vqa_med_2019/default_vqa_med_2019.yml index ae570a1..aa04ba4 100644 --- a/configs/vqa_med_2019/default_vqa_med_2019.yml +++ b/configs/vqa_med_2019/default_vqa_med_2019.yml @@ -26,7 +26,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml index 8c7f748..ad78835 100644 --- a/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/deepta/glove_gru_resnet50_coattn_mfb_is_cat_ffn_c123_loss.yml @@ -65,7 +65,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 2000 #10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml b/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml index 551f57c..da816bd 100644 --- a/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml +++ b/configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml @@ -65,7 +65,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 5000 #10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/example_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/example_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml index 831e894..03857bd 100644 --- a/configs/vqa_med_2019/evaluation/example_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/example_mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml @@ -70,7 +70,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/frozen_if_ffn_c1234_loss.yml b/configs/vqa_med_2019/evaluation/frozen_if_ffn_c1234_loss.yml index da7212f..f5c9328 100644 --- a/configs/vqa_med_2019/evaluation/frozen_if_ffn_c1234_loss.yml +++ 
b/configs/vqa_med_2019/evaluation/frozen_if_ffn_c1234_loss.yml @@ -48,7 +48,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/frozen_if_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/frozen_if_ffn_c123_loss.yml index e061c56..6340fe3 100644 --- a/configs/vqa_med_2019/evaluation/frozen_if_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/frozen_if_ffn_c123_loss.yml @@ -48,7 +48,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/frozen_if_vf_5ffn_c1234yn_5losses.yml b/configs/vqa_med_2019/evaluation/frozen_if_vf_5ffn_c1234yn_5losses.yml index 1a072bd..ea732e9 100644 --- a/configs/vqa_med_2019/evaluation/frozen_if_vf_5ffn_c1234yn_5losses.yml +++ b/configs/vqa_med_2019/evaluation/frozen_if_vf_5ffn_c1234yn_5losses.yml @@ -48,7 +48,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/frozen_if_vf_5ffn_support_c1234yn_5losses.yml b/configs/vqa_med_2019/evaluation/frozen_if_vf_5ffn_support_c1234yn_5losses.yml index 62441f5..073b78e 100644 --- a/configs/vqa_med_2019/evaluation/frozen_if_vf_5ffn_support_c1234yn_5losses.yml +++ b/configs/vqa_med_2019/evaluation/frozen_if_vf_5ffn_support_c1234yn_5losses.yml @@ -63,7 +63,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml index 038a565..bf81cb7 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_att_is_cat_ffn_c123_loss.yml @@ -71,7 +71,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml index 5f937e7..f765357 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml @@ -69,7 +69,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml index cd2ec31..9bfb01f 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml @@ -71,7 +71,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml index 
0014c9e..d72c1e6 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml @@ -69,7 +69,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml index 1808c57..5a8bf30 100644 --- a/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml +++ b/configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_mcb_is_cat_ffn_c123_loss.yml @@ -69,7 +69,7 @@ training: # Terminal conditions: terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 episode_limit: 10000 epoch_limit: -1 diff --git a/configs/vqa_med_2019/question_categorization/default_question_categorization.yml b/configs/vqa_med_2019/question_categorization/default_question_categorization.yml index 0e7e5c0..ee3873f 100644 --- a/configs/vqa_med_2019/question_categorization/default_question_categorization.yml +++ b/configs/vqa_med_2019/question_categorization/default_question_categorization.yml @@ -11,7 +11,7 @@ training: sampler: weights: ~/data/vqa-med/answers.all.weights.csv terminal_conditions: - loss_stop: 1.0e-3 + loss_stop_threshold: 1.0e-3 validation: problem: diff --git a/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml b/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml index 55a1261..dbfe301 100644 --- a/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml +++ b/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml @@ -21,7 +21,7 @@ training: # settings parameters terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 1000000 epoch_limit: 100 diff --git a/configs/wikitext/wikitext_language_modeling_rnn.yml b/configs/wikitext/wikitext_language_modeling_rnn.yml index 3bd504a..d92e13d 100644 --- a/configs/wikitext/wikitext_language_modeling_rnn.yml +++ b/configs/wikitext/wikitext_language_modeling_rnn.yml @@ -15,7 +15,7 @@ training: # settings parameters terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 10000 epoch_limit: 100 diff --git a/configs/wikitext/wikitext_language_modeling_seq2seq.yml b/configs/wikitext/wikitext_language_modeling_seq2seq.yml index 49f611b..e9b2388 100644 --- a/configs/wikitext/wikitext_language_modeling_seq2seq.yml +++ b/configs/wikitext/wikitext_language_modeling_seq2seq.yml @@ -21,7 +21,7 @@ training: # settings parameters terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 1000000 epoch_limit: 100 diff --git a/configs/wikitext/wikitext_language_modeling_seq2seq_simple.yml b/configs/wikitext/wikitext_language_modeling_seq2seq_simple.yml index 4e9d0d6..606e506 100644 --- a/configs/wikitext/wikitext_language_modeling_seq2seq_simple.yml +++ b/configs/wikitext/wikitext_language_modeling_seq2seq_simple.yml @@ -21,7 +21,7 @@ training: # settings parameters terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 1000000 epoch_limit: 100 diff --git a/configs/wily/dummy_language_identification_bow.yml b/configs/wily/dummy_language_identification_bow.yml index 2ff27a0..6a31ac7 100644 --- a/configs/wily/dummy_language_identification_bow.yml +++ b/configs/wily/dummy_language_identification_bow.yml @@ 
-15,7 +15,7 @@ training: # settings parameters terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 10000 epoch_limit: 100 diff --git a/configs/wily/wily_language_identification_bow.yml b/configs/wily/wily_language_identification_bow.yml index aa84c8c..b5f4c29 100644 --- a/configs/wily/wily_language_identification_bow.yml +++ b/configs/wily/wily_language_identification_bow.yml @@ -20,7 +20,7 @@ training: # settings parameters terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 10000 epoch_limit: 100 diff --git a/configs/wily/wily_ngram_language_modeling.yml b/configs/wily/wily_ngram_language_modeling.yml index 316a95a..65b2650 100644 --- a/configs/wily/wily_ngram_language_modeling.yml +++ b/configs/wily/wily_ngram_language_modeling.yml @@ -22,7 +22,7 @@ training: # settings parameters terminal_conditions: - loss_stop: 1.0e-2 + loss_stop_threshold: 1.0e-2 episode_limit: 10000 epoch_limit: 100 diff --git a/ptp/application/pipeline_manager.py b/ptp/application/pipeline_manager.py index f0e282f..c5ee329 100644 --- a/ptp/application/pipeline_manager.py +++ b/ptp/application/pipeline_manager.py @@ -20,7 +20,7 @@ import os import torch from datetime import datetime -from numpy import inf +from numpy import inf, average import ptp.components @@ -65,6 +65,9 @@ def __init__(self, name, config): # Initialization of best loss - as INF. self.best_loss = inf self.best_status = "Unknown" + # Indicates the last time the validation loss went down. + # 0 means currently, 1 means during the previous validation, etc. + self.validation_loss_down_counter = 0 def build(self, use_logger=True): @@ -246,6 +249,8 @@ def save(self, chkpt_dir, training_status, loss): log_str = "Exporting pipeline '{}' parameters to checkpoint:\n {}\n".format(self.name, filename) log_str += model_str self.logger.info(log_str) + # Ok, loss went down, reset the counter. + self.validation_loss_down_counter = 0 return True elif self.best_status != training_status: filename = chkpt_dir + self.name + '_best.pt' @@ -258,6 +263,8 @@ def save(self, chkpt_dir, training_status, loss): torch.save(chkpt_loaded, filename) self.logger.info("Updated training status in checkpoint:\n {}".format(filename)) # Else: that was not the best "model". + # Loss didn't go down, increment the counter. + self.validation_loss_down_counter += 1 return False def load(self, checkpoint_file): @@ -613,7 +620,18 @@ def backward(self, data_dict): data_dict[key].backward(retain_graph=True) - def get_loss(self, data_dict): + def return_loss_on_batch(self, stat_col): + """ + Returns the total (summed) loss recorded for the last batch - a single value that can be used e.g. in terminal condition or model(s) saving. + + :param stat_col: ``StatisticsCollector`` object containing the collected "total_loss" statistics. + + :return: Loss (scalar value). + """ + return stat_col["total_loss"][-1] + + + def return_loss_on_set(self, stat_agg): """ Sums all losses and returns a single value that can be used e.g. in terminal condition or model(s) saving. @@ -621,18 +639,8 @@ def get_loss(self, data_dict): :return: Loss (scalar value). """ - if (len(self.losses) == 0): - raise ConfigurationError("Cannot train using backpropagation as there are no 'Loss' components") - loss_sum = 0 - num_losses = 0 - for loss in self.losses: - for key in loss.loss_keys(): - loss_sum += data_dict[key].cpu().item() - num_losses +=1 - # Display additional information for multi-loss pipelines.
- if num_losses > 1: - self.logger.info("Total loss: {}".format(loss_sum)) - return loss_sum + + return stat_agg["total_loss"] def parameters(self, recurse=True): @@ -677,6 +685,20 @@ def add_statistics(self, stat_col): comp = self.__components[prio] comp.add_statistics(stat_col) + # Check number of losses in the pipeline. + num_losses = 0 + for loss in self.losses: + num_losses += len(loss.loss_keys()) + self.show_total_loss = (num_losses > 1) + + # Additional "total loss" (for single- and multi-loss pipelines). + # Collect it always, but show it only for multi-loss pipelines. + if self.show_total_loss: + stat_col.add_statistics("total_loss", '{:12.10f}') + else: + stat_col.add_statistics("total_loss", None) + stat_col.add_statistics("total_loss_support", None) + def collect_statistics(self, stat_col, data_dict): """ @@ -692,6 +714,14 @@ def collect_statistics(self, stat_col, data_dict): comp = self.__components[prio] comp.collect_statistics(stat_col, data_dict) + # Additional "total loss" (for single- and multi-loss pipelines). + loss_sum = 0 + for loss in self.losses: + for key in loss.loss_keys(): + loss_sum += data_dict[key].cpu().item() + stat_col["total_loss"] = loss_sum + stat_col["total_loss_support"] = data_dict["indices"].shape[0] # batch size + def add_aggregators(self, stat_agg): """ @@ -704,6 +734,13 @@ def add_aggregators(self, stat_agg): comp = self.__components[prio] comp.add_aggregators(stat_agg) + # Additional "total loss" (for single- and multi-loss pipelines). + # Collect it always, but show it only for multi-loss pipelines. + if self.show_total_loss: + stat_agg.add_aggregator("total_loss", '{:12.10f}') + else: + stat_agg.add_aggregator("total_loss", None) + def aggregate_statistics(self, stat_col, stat_agg): """ @@ -717,3 +754,14 @@ def aggregate_statistics(self, stat_col, stat_agg): for prio in self.__priorities: comp = self.__components[prio] comp.aggregate_statistics(stat_col, stat_agg) + + # Additional "total loss" (for single- and multi-loss pipelines). + total_losses = stat_col["total_loss"] + supports = stat_col["total_loss_support"] + + # Special case - no samples! + if sum(supports) == 0: + stat_agg.aggregators["total_loss"] = 0 + else: + # Calculate default aggregate - weighted mean. + stat_agg.aggregators["total_loss"] = average(total_losses, weights=supports) diff --git a/ptp/application/sampler_factory.py b/ptp/application/sampler_factory.py index c047065..d3bbbae 100644 --- a/ptp/application/sampler_factory.py +++ b/ptp/application/sampler_factory.py @@ -85,19 +85,19 @@ def build(problem, config, problem_subset_name): logger = logging.initialize_logger('SamplerFactory') try: - # Check presence of the name attribute. - if 'name' not in config: - raise ConfigurationError("The sampler configuration section does not contain the key 'name'") + # Check presence of the typename attribute. + if 'type' not in config: + raise ConfigurationError("The sampler configuration section does not contain the key 'type'") - # Get the class name. - name = config['name'] - logger.info('Trying to instantiate the {} sampler object'.format(name)) + # Get the class typename. + typename = config['type'] + logger.info('Trying to instantiate the {} sampler object'.format(typename)) ########################################################################### # Handle first special case: SubsetRandomSampler. - if name == 'SubsetRandomSampler': + if typename == 'SubsetRandomSampler': - # Check presence of the name attribute. + # Check presence of the typename attribute. 
if 'indices' not in config: raise ConfigurationError("The sampler configuration section does not contain the key 'indices' " "required by SubsetRandomSampler") @@ -145,7 +145,7 @@ def build(problem, config, problem_subset_name): ########################################################################### # Handle second special case: WeightedRandomSampler. - elif name == 'WeightedRandomSampler': + elif typename == 'WeightedRandomSampler': # Check presence of the attribute. if 'weights' not in config: @@ -160,7 +160,7 @@ def build(problem, config, problem_subset_name): ########################################################################### # Handle third special case: kFoldRandomSampler. - elif name == 'kFoldRandomSampler': + elif typename == 'kFoldRandomSampler': # Check presence of the attribute. if 'folds' not in config: @@ -179,7 +179,7 @@ def build(problem, config, problem_subset_name): ########################################################################### # Handle fourd special case: kFoldWeightedRandomSampler. - elif name == 'kFoldWeightedRandomSampler': + elif typename == 'kFoldWeightedRandomSampler': # Check presence of the attribute. if 'weights' not in config: @@ -204,17 +204,17 @@ def build(problem, config, problem_subset_name): # Create the sampler object. sampler = ptp_samplers.kFoldWeightedRandomSampler(weights, len(problem), folds, epochs_per_fold, problem_subset_name == 'training') - elif name in ['BatchSampler', 'DistributedSampler']: + elif typename in ['BatchSampler', 'DistributedSampler']: # Sorry, don't support those. Yet;) raise ConfigurationError("Sampler Factory currently does not support the '{}' sampler. Please pick one of the others " - "or use defaults random sampling".format(name)) + "or use defaults random sampling".format(typename)) else: # Verify that the specified class is in the samplers package. - if name not in dir(pt_samplers): - raise ConfigurationError("Could not find the specified class '{}' in the samplers package".format(name)) + if typename not in dir(pt_samplers): + raise ConfigurationError("Could not find the specified class '{}' in the samplers package".format(typename)) # Get the sampler class. - sampler_class = getattr(pt_samplers, name) + sampler_class = getattr(pt_samplers, typename) # Create "regular" sampler. sampler = sampler_class(problem) diff --git a/ptp/components/masking/string_to_mask.py b/ptp/components/masking/string_to_mask.py index 671cc70..175521d 100644 --- a/ptp/components/masking/string_to_mask.py +++ b/ptp/components/masking/string_to_mask.py @@ -87,7 +87,7 @@ def __call__(self, data_dict): # Process samples 1 by 1. for i,sample in enumerate(strings): - assert not isinstance(sample, (list,)), 'This encoder requires input sample to contain a single word' + assert not isinstance(sample, (list,)), "This masking component requires input 'string' to contain a single word" # Process single token. if sample in self.word_to_ix.keys(): masks[i] = 1 diff --git a/ptp/components/problems/image_to_class/image_to_class_problem.py b/ptp/components/problems/image_to_class/image_to_class_problem.py index 4f31733..b049322 100644 --- a/ptp/components/problems/image_to_class/image_to_class_problem.py +++ b/ptp/components/problems/image_to_class/image_to_class_problem.py @@ -46,11 +46,6 @@ def __init__(self, name, class_type, config): # Call base class constructors. super(ImageToClassProblem, self).__init__(name, class_type, config) - # Get default key mappings. 
- self.key_inputs = self.stream_keys["inputs"] - self.key_targets = self.stream_keys["targets"] - - def show_sample(self, data_dict, sample_number=0): """ Shows a sample from the batch. diff --git a/ptp/components/problems/image_to_class/mnist.py b/ptp/components/problems/image_to_class/mnist.py index 43c2d98..8d0d784 100644 --- a/ptp/components/problems/image_to_class/mnist.py +++ b/ptp/components/problems/image_to_class/mnist.py @@ -57,7 +57,10 @@ def __init__(self, name, config): # Call base class constructors. super(MNIST, self).__init__(name, MNIST, config) - # Channel returning targets as words. + # Get default key mappings. + self.key_inputs = self.stream_keys["inputs"] + self.key_targets = self.stream_keys["targets"] + # Stream returning targets as words. self.key_labels = self.stream_keys["labels"] # Get absolute path. diff --git a/ptp/components/publishers/accuracy_statistics.py b/ptp/components/publishers/accuracy_statistics.py index c3d81f8..4ff9d9b 100644 --- a/ptp/components/publishers/accuracy_statistics.py +++ b/ptp/components/publishers/accuracy_statistics.py @@ -155,7 +155,7 @@ def add_statistics(self, stat_col): def collect_statistics(self, stat_col, data_dict): """ - Collects statistics (batch_size) for given episode. + Collects statistics (accuracy and support set size) for given episode. :param stat_col: ``StatisticsCollector``. @@ -188,13 +188,19 @@ def aggregate_statistics(self, stat_col, stat_agg): """ accuracies = stat_col[self.key_accuracy] - batch_sizes = stat_col[self.key_accuracy+'_support'] + supports = stat_col[self.key_accuracy+'_support'] - # Calculate weighted precision. - accuracies_avg = np.average(accuracies, weights=batch_sizes) - accuracies_var = np.average((accuracies-accuracies_avg)**2, weights=batch_sizes) + # Special case - no samples! + if sum(supports) == 0: + stat_agg[self.key_accuracy] = 0 + stat_agg[self.key_accuracy+'_std'] = 0 - stat_agg[self.key_accuracy] = accuracies_avg - #stat_agg[self.key_accuracy+'_min'] = np.min(accuracies) - #stat_agg[self.key_accuracy+'_max'] = np.max(accuracies) - stat_agg[self.key_accuracy+'_std'] = math.sqrt(accuracies_var) + else: + # Calculate weighted precision. + accuracies_avg = np.average(accuracies, weights=supports) + accuracies_var = np.average((accuracies-accuracies_avg)**2, weights=supports) + + stat_agg[self.key_accuracy] = accuracies_avg + #stat_agg[self.key_accuracy+'_min'] = np.min(accuracies) + #stat_agg[self.key_accuracy+'_max'] = np.max(accuracies) + stat_agg[self.key_accuracy+'_std'] = math.sqrt(accuracies_var) diff --git a/ptp/workers/offline_trainer.py b/ptp/workers/offline_trainer.py new file mode 100644 index 0000000..11a783e --- /dev/null +++ b/ptp/workers/offline_trainer.py @@ -0,0 +1,364 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) tkornuta, IBM Corporation 2019 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__author__ = "Tomasz Kornuta, Vincent Marois" + +import torch +import numpy as np + +from ptp.workers.trainer import Trainer +import ptp.configuration.config_parsing as config_parsing +from ptp.configuration.configuration_error import ConfigurationError +from ptp.utils.termination_condition import TerminationCondition + +class OfflineTrainer(Trainer): + """ + Implementation for the epoch-based ``OfflineTrainer``. + + .. note:: + + The default ``OfflineTrainer`` is based on epochs. \ + An epoch is defined as passing through all samples of a finite-size dataset.\ + The ``OfflineTrainer`` allows looping over all samples from the training set many times, i.e. over many epochs. \ + When an epoch finishes, it performs a similar step for the validation set and collects the statistics. + + """ + + def __init__(self): + """ + Constructor. It only calls the ``Trainer`` constructor, as the initialization phase is identical to the one from ``Trainer``. + """ + # Call base constructor to set up app state, registry and add default config. + super(OfflineTrainer, self).__init__("OfflineTrainer", OfflineTrainer) + + def setup_experiment(self): + """ + Sets up experiment for the epoch-based trainer: + + - Calls base class setup_experiment to parse the command line arguments, + - Sets up the terminal conditions (loss threshold, early stopping, epoch limit and optional episode limit). + + """ + # Call base method to parse all command line arguments, load configuration, create problems and model etc. + super(OfflineTrainer, self).setup_experiment() + + # In this trainer Partial Validation is optional, hence it is activated only when the interval is > 0. + self.partial_validation_interval = self.config['validation']['partial_validation_interval'] + if self.partial_validation_interval <= 0: + self.logger.info("Partial Validation deactivated") + else: + self.logger.info("Partial Validation activated with interval equal to {} episodes".format(self.partial_validation_interval)) + + ################# TERMINAL CONDITIONS ################# + log_str = 'Terminal conditions:\n' + '='*80 + "\n" + + # Terminal condition I: loss. + self.loss_stop_threshold = self.config_training['terminal_conditions']['loss_stop_threshold'] + log_str += " I: Setting Loss Stop Threshold to {}\n".format(self.loss_stop_threshold) + + # Terminal condition II: early stopping. + self.early_stop_validations = self.config_training['terminal_conditions']['early_stop_validations'] + if self.early_stop_validations <= 0: + log_str += " II: Termination based on Early Stopping is disabled\n" + # Set to infinity. + self.early_stop_validations = np.Inf + else: + log_str += " II: Setting the Number of Validations in Early Stopping to: {}\n".format(self.early_stop_validations) + + # Terminal condition III: max epochs. Mandatory. + self.epoch_limit = self.config_training["terminal_conditions"]["epoch_limit"] + if self.epoch_limit <= 0: + self.logger.error("OffLine Trainer relies on epochs, thus Epoch Limit must be a positive number!") + exit(-5) + else: + log_str += " III: Setting the Epoch Limit to: {}\n".format(self.epoch_limit) + + # Log the epoch size in terms of episodes. + self.epoch_size = self.training.get_epoch_size() + log_str += " Epoch size in terms of training episodes: {}\n".format(self.epoch_size) + + # Terminal condition IV: max episodes. Optional. + self.episode_limit = self.config_training['terminal_conditions']['episode_limit'] + if self.episode_limit < 0: + log_str += " IV: Termination based on Episode Limit is disabled\n" + # Set to infinity.
+ self.episode_limit = np.Inf + else: + log_str += " IV: Setting the Episode Limit to: {}\n".format(self.episode_limit) + + # Ok, finally print it. + log_str += '='*80 + self.logger.info(log_str) + + # Export and log configuration, optionally asking the user for confirmation. + config_parsing.display_parsing_results(self.logger, self.app_state.args, self.unparsed) + config_parsing.display_globals(self.logger, self.app_state.globalitems()) + config_parsing.export_experiment_configuration_to_yml(self.logger, self.app_state.log_dir, "training_configuration.yml", self.config, self.app_state.args.confirm) + + def run_experiment(self): + """ + Main function of the ``OfflineTrainer``, runs the experiment. + + Iterates over epochs, and within each epoch over the training DataLoader (one iteration = one episode). + + .. note:: + + The test for terminal conditions (e.g. convergence) is done at the end of each epoch (only the episode limit is checked after every episode). \ + The terminal conditions are as follows: + + - I. The loss is below the specified threshold (using the full validation loss), + - II. Early stopping is set and the full validation loss has not gone down \ + for the indicated number of validation steps, + - III. The maximum number of episodes has been met (OPTIONAL), + - IV. The maximum number of epochs has been met. + + Additionally, the experiment can be stopped by the user by pressing 'Stop experiment' \ + during visualization. + + + The function does the following for each episode: + + - Handles curriculum learning if set, + - Resets the gradients, + - Performs the forward pass of the model, + - Logs statistics and exports to TensorBoard (if set), + - Computes gradients and updates the weights, + - Activates visualization if set, + - Validates the model on a batch according to the validation frequency, + - Checks the above terminal conditions. + + + """ + # Initialize TensorBoard and statistics collection. + self.initialize_statistics_collection() + self.initialize_tensorboard() + + try: + ''' + Main training and validation loop. + ''' + # Reset the counters. + self.app_state.episode = -1 + self.app_state.epoch = -1 + # Set initial status. + training_status = "Not Converged" + + ################################################################################################ + # Beginning of external "epic loop". + ################################################################################################ + while(True): + self.app_state.epoch += 1 + self.logger.info('Starting next epoch: {}\n{}'.format(self.app_state.epoch, '='*80)) + + # Inform the problem managers that the epoch has started. + self.training.initialize_epoch() + self.validation.initialize_epoch() + + # Apply curriculum learning - change Problem parameters. + self.curric_done = self.training.problem.curriculum_learning_update_params( + 0 if self.app_state.episode < 0 else self.app_state.episode, + self.app_state.epoch) + + + # Empty the statistics collector. + self.training_stat_col.empty() + + ############################################################################################ + # Beginning of internal "episodic loop". + ############################################################################################ + for training_batch in self.training.dataloader: + # Next episode. + self.app_state.episode += 1 + + # reset all gradients + self.optimizer.zero_grad() + + # Turn on training mode for the model. + self.pipeline.train() + + # 1. Perform forward step. + self.pipeline.forward(training_batch) + + # 2. Calculate statistics.
+ self.collect_all_statistics(self.training, self.pipeline, training_batch, self.training_stat_col) + + # 3. Backward gradient flow. + self.pipeline.backward(training_batch) + + # Check the presence of the 'gradient_clipping' parameter. + try: + # if present - clip gradients to a range (-gradient_clipping, gradient_clipping) + val = self.config_training['gradient_clipping'] + torch.nn.utils.clip_grad_value_(self.pipeline.parameters(), val) + except KeyError: + # Else - do nothing. + pass + + # 4. Perform optimization. + self.optimizer.step() + + # 5. Log collected statistics. + # 5.1. Export to csv - at every step. + self.training_stat_col.export_to_csv() + + # 5.2. Export data to TensorBoard - at logging frequency. + if (self.training_batch_writer is not None) and \ + (self.app_state.episode % self.app_state.args.logging_interval == 0): + self.training_stat_col.export_to_tensorboard() + + # Export histograms. + if self.app_state.args.tensorboard >= 1: + for name, param in self.pipeline.named_parameters(): + try: + self.training_batch_writer.add_histogram(name, + param.data.cpu().numpy(), self.app_state.episode, bins='doane') + + except Exception as e: + self.logger.error(" {} :: data :: {}".format(name, e)) + + # Export gradients. + if self.app_state.args.tensorboard >= 2: + for name, param in self.pipeline.named_parameters(): + try: + self.training_batch_writer.add_histogram(name + '/grad', + param.grad.data.cpu().numpy(), self.app_state.episode, bins='doane') + + except Exception as e: + self.logger.error(" {} :: grad :: {}".format(name, e)) + + # 5.3. Log to logger - at logging frequency. + if self.app_state.episode % self.app_state.args.logging_interval == 0: + self.logger.info(self.training_stat_col.export_to_string()) + + # 6. Validate and (optionally) save the model. + if self.partial_validation_interval > 0 and (self.app_state.episode % self.partial_validation_interval) == 0: + # Clear the validation batch from all items aside of the ones originally returned by the problem. + self.validation.batch.reinitialize(self.validation.problem.output_data_definitions()) + # Perform validation. + self.validate_on_batch(self.validation.batch) + # Do not save the model: OfflineTrainer uses the full set to determine whether to save or not. + + # III. The episodes number limit has been reached. + if self.app_state.episode+1 >= self.episode_limit: # = np.Inf when inactive. + # If we reach this condition, then it is possible that the model didn't converge correctly + # but it currently might get better since last validation. + training_status = "Not converged: Episode Limit reached" + raise TerminationCondition(training_status) + + ############################################################################################ + # End of internal "episodic loop". + ############################################################################################ + + # Epoch just ended! + self.logger.info('End of epoch: {}\n{}'.format(self.app_state.epoch, '='*80)) + + # Aggregate training statistics for the epoch. + self.aggregate_all_statistics(self.training, self.pipeline, self.training_stat_col, self.training_stat_agg) + self.export_all_statistics( self.training_stat_agg, '[Full Training]') + + # Inform the training problem manager that the epoch has ended. + self.training.finalize_epoch() + + # Validate over the entire validation set. + self.validate_on_set() + + # Get loss. 
+                validation_set_loss = self.pipeline.return_loss_on_set(self.validation_stat_agg)
+
+                # Save the pipeline using the latest validation statistics.
+                self.pipeline.save(self.checkpoint_dir, training_status, validation_set_loss)
+
+                # Inform the validation problem manager that the epoch has ended.
+                self.validation.finalize_epoch()
+
+                # Terminal conditions.
+                # I. The loss is < threshold (only when curriculum learning is finished, if set).
+                # We check this condition only after validation!
+                if self.curric_done or not self.must_finish_curriculum:
+
+                    # Check the full validation set loss.
+                    if (validation_set_loss < self.loss_stop_threshold):
+                        # Change the status.
+                        training_status = "Converged (Full Validation Loss went below " \
+                            "Loss Stop threshold of {})".format(self.loss_stop_threshold)
+
+                        # Save the pipeline (update its statistics).
+                        self.pipeline.save(self.checkpoint_dir, training_status, validation_set_loss)
+                        # And leave both loops.
+                        raise TerminationCondition(training_status)
+
+                    # II. Early stopping is set and the loss has not improved during the last n validations.
+                    if self.pipeline.validation_loss_down_counter >= self.early_stop_validations:
+                        training_status = "Not converged: reached limit of validations without improvement (Early Stopping)"
+                        raise TerminationCondition(training_status)
+
+                # IV. Epoch limit has been reached.
+                if self.app_state.epoch+1 >= self.epoch_limit:
+                    training_status = "Not converged: Epoch Limit reached"
+                    # "Finish" the training.
+                    raise TerminationCondition(training_status)
+
+            ################################################################################################
+            # End of external "epoch loop".
+            ################################################################################################
+
+        except TerminationCondition as e:
+            # End of main training and validation loop. Perform final full validation.
+
+            self.logger.info('\n' + '='*80)
+            self.logger.info('Training finished because {}'.format(training_status))
+
+            # If the episode limit was reached - perform a last validation on the full set.
+            if training_status == "Not converged: Episode Limit reached":
+                # Validate over the entire validation set.
+                self.validate_on_set()
+                # Get loss.
+                validation_set_loss = self.pipeline.return_loss_on_set(self.validation_stat_agg)
+                # Save the pipeline using the latest validation statistics.
+                self.pipeline.save(self.checkpoint_dir, training_status, validation_set_loss)
+
+            self.logger.info('Experiment finished!')
+
+        except SystemExit as e:
+            # The training did not end properly.
+            self.logger.error('Experiment interrupted because {}'.format(e))
+        except ConfigurationError as e:
+            # The training did not end properly.
+            self.logger.error('Experiment interrupted because {}'.format(e))
+        except KeyboardInterrupt:
+            # The training did not end properly.
+            self.logger.error('Experiment interrupted!')
+        finally:
+            # Finalize statistics collection.
+            self.finalize_statistics_collection()
+            self.finalize_tensorboard()
+            self.logger.info("Experiment logged to: {}".format(self.app_state.log_dir))
+
+
+def main():
+    """
+    Entry point function for the ``OfflineTrainer``.
+    """
+    # Create trainer.
+    trainer = OfflineTrainer()
+    # Parse args, load configuration and create all required objects.
+    trainer.setup_experiment()
+    # GO!
+    trainer.run_experiment()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/ptp/workers/online_trainer.py b/ptp/workers/online_trainer.py
index 1361c9e..165c93e 100644
--- a/ptp/workers/online_trainer.py
+++ b/ptp/workers/online_trainer.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__author__ = "Vincent Marois, Tomasz Kornuta"
+__author__ = "Tomasz Kornuta, Vincent Marois"
 
 import torch
 import numpy as np
@@ -60,7 +60,7 @@ def setup_experiment(self):
         # In this trainer Partial Validation is mandatory, hence interval must be > 0.
         self.partial_validation_interval = self.config['validation']['partial_validation_interval']
         if self.partial_validation_interval <= 0:
-            self.logger.error("Episodic Trainer relies on Partial Validation, thus 'partial_validation_interval' must be a positive number!")
+            self.logger.error("Online Trainer relies on Partial Validation, thus 'partial_validation_interval' must be a positive number!")
             exit(-4)
         else:
             self.logger.info("Partial Validation activated with interval equal to {} episodes\n".format(self.partial_validation_interval))
@@ -69,29 +69,38 @@ def setup_experiment(self):
         log_str = 'Terminal conditions:\n' + '='*80 + "\n"
 
         # Terminal condition I: loss.
-        self.loss_stop = self.config_training['terminal_conditions']['loss_stop']
-        log_str += " Setting Loss Stop threshold to {}\n".format(self.loss_stop)
+        self.loss_stop_threshold = self.config_training['terminal_conditions']['loss_stop_threshold']
+        log_str += " I: Setting Loss Stop Threshold to {}\n".format(self.loss_stop_threshold)
 
-        # Terminal condition II: max epochs. Optional.
+        # Terminal condition II: early stopping.
+        self.early_stop_validations = self.config_training['terminal_conditions']['early_stop_validations']
+        if self.early_stop_validations <= 0:
+            log_str += " II: Termination based on Early Stopping is disabled\n"
+            # Set to infinity.
+            self.early_stop_validations = np.Inf
+        else:
+            log_str += " II: Setting the Number of Validations in Early Stopping to: {}\n".format(self.early_stop_validations)
+
+        # Terminal condition III: max epochs (Optional for this trainer)
         self.epoch_limit = self.config_training["terminal_conditions"]["epoch_limit"]
         if self.epoch_limit <= 0:
-            log_str += " Termination based on Epoch Limit is disabled\n"
+            log_str += " III: Termination based on Epoch Limit is disabled\n"
             # Set to infinity.
             self.epoch_limit = np.Inf
         else:
-            log_str += " Setting the Epoch Limit to: {}\n".format(self.epoch_limit)
+            log_str += " III: Setting the Epoch Limit to: {}\n".format(self.epoch_limit)
 
-        # Calculate the epoch size in terms of episodes.
-        self.epoch_size = len(self.training)
-        log_str += " Epoch size in terms of training episodes: {}\n".format(self.epoch_size)
+        # Log the epoch size in terms of episodes.
+        self.epoch_size = self.training.get_epoch_size()
+        log_str += " Epoch size in terms of training episodes: {}\n".format(self.epoch_size)
 
-        # Terminal condition III: max episodes. Mandatory.
+        # Terminal condition IV: max episodes. Mandatory.
         self.episode_limit = self.config_training['terminal_conditions']['episode_limit']
         if self.episode_limit <= 0:
             self.logger.error("OnLine Trainer relies on episodes, thus 'episode_limit' must be a positive number!")
             exit(-5)
         else:
-            log_str += " Setting the Episode Limit to: {}\n".format(self.episode_limit)
+            log_str += " IV: Setting the Episode Limit to: {}\n".format(self.episode_limit)
 
         # Ok, finally print it.
         log_str += '='*80
         self.logger.info(log_str)
@@ -113,8 +122,8 @@ def run_experiment(self):
             The terminal conditions are as follows:
 
                 - I. The loss is below the specified threshold (using the partial validation loss),
-                - TODO: II. Early stopping is set and the full validation loss did not change by delta \
-                    for the indicated number of epochs,
+                - II. Early stopping is set and the full validation loss did not go down \
+                    for the indicated number of validation steps,
                 - III. The maximum number of episodes has been met,
                 - IV. The maximum number of epochs has been met (OPTIONAL).
@@ -244,30 +253,31 @@ def run_experiment(self):
                        # Perform validation.
                        self.validate_on_batch(self.validation.batch)
                        # Get loss.
-                        validation_loss = self.pipeline.get_loss(self.validation.batch)
+                        validation_batch_loss = self.pipeline.return_loss_on_batch(self.validation_stat_col)
 
                        # Save the pipeline using the latest validation statistics.
-                        self.pipeline.save(self.checkpoint_dir, training_status, validation_loss)
+                        self.pipeline.save(self.checkpoint_dir, training_status, validation_batch_loss)
 
                        # Terminal conditions.
-                        # I. the loss is < threshold (only when curriculum learning is finished if set.)
+                        # I. The loss is < threshold (only when curriculum learning is finished if set).
                        # We check that condition only in validation step!
                        if self.curric_done or not self.must_finish_curriculum:
 
                            # Check the Partial Validation loss.
-                            if (validation_loss < self.loss_stop):
+                            if (validation_batch_loss < self.loss_stop_threshold):
                                # Change the status.
                                training_status = "Converged (Partial Validation Loss went below " \
-                                    "Loss Stop threshold)"
+                                    "Loss Stop threshold {})".format(self.loss_stop_threshold)
 
                                # Save the pipeline (update its statistics).
-                                self.pipeline.save(self.checkpoint_dir, training_status, validation_loss)
+                                self.pipeline.save(self.checkpoint_dir, training_status, validation_batch_loss)
                                # And leave both loops.
                                raise TerminationCondition(training_status)
 
                            # II. Early stopping is set and loss hasn't improved by delta in n epochs.
-                            # early_stopping(index=epoch, avg_valid_loss). (TODO)
-                            # training_status = 'Early Stopping.'
+                            if self.pipeline.validation_loss_down_counter >= self.early_stop_validations:
+                                training_status = "Not converged: reached limit of validations without improvement (Early Stopping)"
+                                raise TerminationCondition(training_status)
 
                        # III. The episodes number limit has been reached.
                        if self.app_state.episode+1 >= self.episode_limit:
@@ -305,15 +315,17 @@ def run_experiment(self):
            # End of main training and validation loop. Perform final full validation.
            # Eventually perform "last" validation on batch.
            if self.validation_stat_col["episode"][-1] != self.app_state.episode:
-                # We still must validate and try to save the model as it may perform better during this episode.
+                # We still must validate and try to save the model as it may have performed better during this episode.
                # Clear the validation batch from all items aside of the ones originally returned by the problem.
                self.validation.batch.reinitialize(self.validation.problem.output_data_definitions())
                # Perform validation.
                self.validate_on_batch(self.validation.batch)
+                # Get loss.
+                validation_batch_loss = self.pipeline.return_loss_on_batch(self.validation_stat_col)
 
                # Try to save the model using the latest validation statistics.
-                self.pipeline.save(self.checkpoint_dir, training_status, validation_loss)
+                self.pipeline.save(self.checkpoint_dir, training_status, validation_batch_loss)
 
            self.logger.info('\n' + '='*80)
            self.logger.info('Training finished because {}'.format(training_status))
@@ -344,18 +356,12 @@ def main():
     """
     Entry point function for the ``OnlineTrainer``.
     """
-    try:
-        # Create trainer.
-        trainer = OnlineTrainer()
-        # Parse args, load configuration and create all required objects.
-        trainer.setup_experiment()
-        # GO!
-        trainer.run_experiment()
-
-    except KeyError as e:
-        print("Error: {}".format(e))
-        exit(-1)
-
+    # Create trainer.
+    trainer = OnlineTrainer()
+    # Parse args, load configuration and create all required objects.
+    trainer.setup_experiment()
+    # GO!
+    trainer.run_experiment()
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/ptp/workers/processor.py b/ptp/workers/processor.py
index f8c189b..9edad60 100644
--- a/ptp/workers/processor.py
+++ b/ptp/workers/processor.py
@@ -433,19 +433,13 @@ def main():
     """
     Entry point function for the ``Processor``.
     """
-    try:
-        processor = Processor()
-        # parse args, load configuration and create all required objects.
-        processor.setup_global_experiment()
-        # finalize the experiment setup
-        processor.setup_individual_experiment()
-        # run the experiment
-        processor.run_experiment()
-
-    except KeyError as e:
-        print("Error: {}".format(e))
-        exit(-1)
-
+    processor = Processor()
+    # parse args, load configuration and create all required objects.
+    processor.setup_global_experiment()
+    # finalize the experiment setup
+    processor.setup_individual_experiment()
+    # run the experiment
+    processor.run_experiment()
 
 if __name__ == '__main__':
     main()
diff --git a/setup.py b/setup.py
index b7ba573..ca9a2e7 100644
--- a/setup.py
+++ b/setup.py
@@ -212,6 +212,7 @@
 entry_points={  # Optional
     'console_scripts': [
         'ptp-online-trainer=ptp.workers.online_trainer:main',
+        'ptp-offline-trainer=ptp.workers.offline_trainer:main',
         'ptp-processor=ptp.workers.processor:main',
     ]
 },
diff --git a/tests/sampler_factory_tests.py b/tests/sampler_factory_tests.py
index dae63c6..70e3ce5 100644
--- a/tests/sampler_factory_tests.py
+++ b/tests/sampler_factory_tests.py
@@ -39,7 +39,7 @@ def test_create_subset_random_sampler_range(self):
         indices = range(20)
 
         config = ConfigInterface()
-        config.add_default_params({'name': 'SubsetRandomSampler',
+        config.add_default_params({'type': 'SubsetRandomSampler',
                                    'indices': indices})
         # Create the sampler.
         sampler = SamplerFactory.build(TestProblemMockup(), config, "training")
@@ -52,7 +52,7 @@ def test_create_subset_random_sampler_range_str(self):
         range_str = '0, 20'
 
         config = ConfigInterface()
-        config.add_default_params({'name': 'SubsetRandomSampler',
+        config.add_default_params({'type': 'SubsetRandomSampler',
                                    'indices': range_str})
         # Create the sampler.
         sampler = SamplerFactory.build(TestProblemMockup(), config, "training")
@@ -66,7 +66,7 @@ def test_create_subset_random_sampler_list_of_indices(self):
         yaml_list = yaml.load('[0, 2, 5, 10]')
 
         config = ConfigInterface()
-        config.add_default_params({'name': 'SubsetRandomSampler',
+        config.add_default_params({'type': 'SubsetRandomSampler',
                                    'indices': yaml_list})
         # Create the sampler.
         sampler = SamplerFactory.build(TestProblemMockup(), config, "training")
@@ -85,7 +85,7 @@ def test_create_subset_random_sampler_file(self):
         indices.tofile(filename, sep=',', format="%s")
 
         config = ConfigInterface()
-        config.add_default_params({'name': 'SubsetRandomSampler',
+        config.add_default_params({'type': 'SubsetRandomSampler',
                                    'indices': filename})
         # Create the sampler.
         sampler = SamplerFactory.build(TestProblemMockup(), config, "training")
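
Both trainers now terminate on `self.pipeline.validation_loss_down_counter >= self.early_stop_validations`, but the logic that maintains `validation_loss_down_counter` lives in the pipeline and is not part of this patch. The sketch below shows, under that assumption, how such a counter is typically updated after each validation; the class and method names are illustrative only and do not correspond to the actual ptp API.

    import numpy as np

    class EarlyStoppingCounterSketch:
        """Illustrative sketch of an early-stopping counter of the kind the trainers rely on."""

        def __init__(self, early_stop_validations):
            # A non-positive value disables early stopping (mirrors the trainers' handling).
            self.patience = early_stop_validations if early_stop_validations > 0 else np.inf
            self.best_loss = np.inf
            self.validation_loss_down_counter = 0

        def update(self, validation_loss):
            """Call after every validation (partial or full)."""
            if validation_loss < self.best_loss:
                # Loss went down: remember it and reset the counter.
                self.best_loss = validation_loss
                self.validation_loss_down_counter = 0
            else:
                # Loss did not go down: count one more validation without improvement.
                self.validation_loss_down_counter += 1

        def should_stop(self):
            # Same comparison as terminal condition II used by both trainers.
            return self.validation_loss_down_counter >= self.patience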