diff --git a/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml b/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
index 715cb17..bc45a9e 100644
--- a/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
+++ b/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
@@ -55,6 +55,23 @@ answer_preprocessing: none
 # Those weights can be next used by weighted samplers (e.g. kFoldWeightedSampler)
 export_sample_weights: ''
 
+# Shuffle the indices of the input (source) files/samples.
+# Leaving this set to False will result in the original order of files/samples,
+# i.e. C1, then C2, then C3 etc.
+shuffle_indices: False
+
+# Generate and export (potentially shuffled) indices (LOADED)
+# If not empty, will:
+# * shuffle the indices of all samples and export them to a file.
+# * use those indices during sampling.
+export_indices: ''
+
+# Import (potentially shuffled) indices (LOADED)
+# If not empty, will:
+# * import the indices from a file.
+# * use those indices during sampling.
+import_indices: ''
+
 streams:
   ####################################################################
   # 2. Keymappings associated with INPUT and OUTPUT streams.
diff --git a/configs/vqa_med_2019/default_vqa_med_2019.yml b/configs/vqa_med_2019/default_vqa_med_2019.yml
index dfe01a6..589c791 100644
--- a/configs/vqa_med_2019/default_vqa_med_2019.yml
+++ b/configs/vqa_med_2019/default_vqa_med_2019.yml
@@ -6,11 +6,15 @@ training:
     split: training_validation
     resize_image: &resize_image [224, 224]
     batch_size: 64
+    # Generate and export shuffled indices.
+    shuffle_indices: True
+    export_indices: shuffled_indices.npy
   # Default sampler during training.
   sampler:
     name: kFoldWeightedRandomSampler
-    folds: 5
+    folds: 10
+    epochs_per_fold: 20
   # Use four workers for loading images.
   dataloader:
     num_workers: 4
@@ -35,10 +39,14 @@ validation:
     split: training_validation
     resize_image: *resize_image
     batch_size: 64
+    # Import shuffled indices.
+    import_indices: shuffled_indices.npy
+
   # Default sampler during validation.
   sampler:
     name: kFoldRandomSampler
-    folds: 5
+    folds: 10
+    epochs_per_fold: 20
   # Use four workers for loading images.
   dataloader:
     num_workers: 4
diff --git a/configs/vqa_med_2019/question_categorization/default_question_categorization.yml b/configs/vqa_med_2019/question_categorization/default_question_categorization.yml
index 4a0f255..0e7e5c0 100644
--- a/configs/vqa_med_2019/question_categorization/default_question_categorization.yml
+++ b/configs/vqa_med_2019/question_categorization/default_question_categorization.yml
@@ -31,7 +31,7 @@ pipeline:
     import_word_mappings_from_globals: True
     streams:
       inputs: predictions
-      outputs: predicted_categories
+      outputs: predicted_category_names
     globals:
       vocabulary_size: num_categories
       word_mappings: category_word_mappings
@@ -72,6 +72,6 @@ pipeline:
   viewer:
     type: StreamViewer
     priority: 100.4
-    input_streams: questions,category_names,predicted_categories
+    input_streams: questions,category_names,predicted_category_names
 
 #: pipeline
diff --git a/ptp/application/sampler_factory.py b/ptp/application/sampler_factory.py
index 934e6e4..0af1b32 100644
--- a/ptp/application/sampler_factory.py
+++ b/ptp/application/sampler_factory.py
@@ -177,9 +177,11 @@ def build(problem, config, problem_subset_name):
         folds = config["folds"]
         if folds < 2:
             raise ConfigurationError("kFoldRandomSampler requires at least two 'folds'")
+        # Get epochs per fold (default: 1).
+        epochs_per_fold = config.get("epochs_per_fold", 1)
         # Create the sampler object.
-        sampler = ptp_samplers.kFoldRandomSampler(len(problem), folds, problem_subset_name == 'training')
+        sampler = ptp_samplers.kFoldRandomSampler(len(problem), folds, epochs_per_fold, problem_subset_name == 'training')
 
     ###########################################################################
     # Handle fourd special case: kFoldWeightedRandomSampler.
@@ -202,9 +204,11 @@ def build(problem, config, problem_subset_name):
         folds = config["folds"]
         if folds < 2:
             raise ConfigurationError("kFoldRandomSampler requires at least two 'folds'")
+        # Get epochs per fold (default: 1).
+        epochs_per_fold = config.get("epochs_per_fold", 1)
         # Create the sampler object.
-        sampler = ptp_samplers.kFoldWeightedRandomSampler(weights, len(problem), folds, problem_subset_name == 'training')
+        sampler = ptp_samplers.kFoldWeightedRandomSampler(weights, len(problem), folds, epochs_per_fold, problem_subset_name == 'training')
 
     elif name in ['BatchSampler', 'DistributedSampler']:
         # Sorry, don't support those. Yet;)
diff --git a/ptp/components/problems/image_text_to_class/vqa_med_2019.py b/ptp/components/problems/image_text_to_class/vqa_med_2019.py
index ed0d2e2..21dec85 100644
--- a/ptp/components/problems/image_text_to_class/vqa_med_2019.py
+++ b/ptp/components/problems/image_text_to_class/vqa_med_2019.py
@@ -258,12 +258,29 @@ def __init__(self, name, config):
             source_image_folder = os.path.join(split_folder, 'VQAMed2019_Test_Images')
             self.dataset = self.load_testset(source_file, source_image_folder)
 
+        # Ok, now we have the whole dataset (for the given "split").
+        self.ix = np.arange(len(self.dataset))
+        if self.config["import_indices"] != '':
+            # Try to load indices from the file.
+            self.ix = np.load(os.path.join(self.app_state.log_dir, self.config["import_indices"]))
+            self.logger.info("Imported indices from '{}'".format(os.path.join(self.app_state.log_dir, self.config["import_indices"])))
+        else:
+            # Check whether we want to shuffle the indices.
+            if self.config["shuffle_indices"]:
+                np.random.shuffle(self.ix)
+            # Export if required.
+            if self.config["export_indices"] != '':
+                # Export indices to a file.
+                np.save(os.path.join(self.app_state.log_dir, self.config["export_indices"]), self.ix)
+                self.logger.info("Exported indices to '{}'".format(os.path.join(self.app_state.log_dir, self.config["export_indices"])))
+
         # Display exemplary sample.
-        self.logger.info("Exemplary sample:\n [ category: {}\t image_ids: {}\t question: {}\t answer: {} ]".format(
-            self.dataset[0][self.key_category_ids],
-            self.dataset[0][self.key_image_ids],
-            self.dataset[0][self.key_questions],
-            self.dataset[0][self.key_answers]
+        self.logger.info("Exemplary sample 0 ({}):\n [ category: {}\t image_ids: {}\t question: {}\t answer: {} ]".format(
+            self.ix[0],
+            self.category_idx_to_word[self.dataset[self.ix[0]][self.key_category_ids]],
+            self.dataset[self.ix[0]][self.key_image_ids],
+            self.dataset[self.ix[0]][self.key_questions],
+            self.dataset[self.ix[0]][self.key_answers]
             ))
 
         # Check if we want the problem to calculate and export the weights.
@@ -703,7 +720,7 @@ def __getitem__(self, index):
         :return: DataDict({'indices', 'images', 'images_ids','questions', 'answers', 'category_ids', 'image_sizes'})
         """
         # Get item.
-        item = self.dataset[index]
+        item = self.dataset[self.ix[index]]
 
         # Create the resulting sample (data dict).
         data_dict = self.create_data_dict(index)
diff --git a/ptp/utils/samplers.py b/ptp/utils/samplers.py
index b99bc3a..e70e979 100644
--- a/ptp/utils/samplers.py
+++ b/ptp/utils/samplers.py
@@ -33,15 +33,19 @@ class kFoldRandomSampler(Sampler):
     Every time __iter__() method is called, it moves to next fold/set of folds.
     """
 
-    def __init__(self, num_samples, num_folds, all_but_current_fold = True):
+    def __init__(self, num_samples, num_folds, epochs_per_fold = 1, all_but_current_fold = True):
         """
         Initializes the sampler by generating the indices associated with the fold(s) that are to be used.
 
-        :param num_samples: Size of the dataset
+        :param num_samples: Size of the dataset
+
         :param num_folds: Number of folds
-        :param all_but_current_fold: Operation mode (DEFAULT: True):
-            When True, generates indices for all-but-one folds (for training). \
-            When False, generates indices for only one fold (for validation). \
+
+        :param epochs_per_fold: Number of epochs that need to pass before the sampler moves to the next fold(s) (DEFAULT: 1)
+
+        :param all_but_current_fold: Operation mode (DEFAULT: True): \
+            When True, generates indices for all-but-one folds (for training) \
+            When False, generates indices for only one fold (for validation)
         """
         # Get number of samples (size of "whole dataset").
         if not isinstance(num_samples, _int_classes) or isinstance(num_samples, bool) or \
@@ -55,12 +59,23 @@ def __init__(self, num_samples, num_folds, all_but_current_fold = True):
                 num_folds <= 0:
             raise ValueError("num_folds should be a positive integeral "
                              "value, but got num_folds={}".format(num_folds))
-        self.num_folds = num_folds
+        # Get the number of epochs per fold.
+        if not isinstance(epochs_per_fold, _int_classes) or isinstance(epochs_per_fold, bool) or \
+                epochs_per_fold <= 0:
+            raise ValueError("epochs_per_fold should be a positive integral "
+                             "value, but got epochs_per_fold={}".format(epochs_per_fold))
+
+        # Store fold-related parameters.
         self.all_but_current_fold = all_but_current_fold
-        # Initialize current "fold" as -1, so then dataloder will call next() for the first time
-        # it will return samples for 0-th fold/all-but-0th fold.
-        self.current_fold = -1
+        self.num_folds = num_folds
+        self.epochs_per_fold = epochs_per_fold
+
+        # Initialize current "fold" so it will return samples for the 0-th fold/all-but-0-th fold.
+        self.current_fold = 0
+        # "Epochs left": the +1 accounts for the "initial" generation of indices performed below.
+        self.epochs_left = self.epochs_per_fold + 1
+
         # Generate "initial" indices.
         self.indices = self.regenerate_indices()
 
@@ -73,9 +88,7 @@ def regenerate_indices(self):
         # Fold size and indices.
         all_indices = range(self.num_samples)
         fold_size = ceil(self.num_samples / self.num_folds)
-
-        # Modulo current fold number by total number of folds.
-        fold = self.current_fold % self.num_folds
+        fold = self.current_fold
 
         # Generate indices associated with the given fold / all except the given fold.
         if self.all_but_current_fold:
@@ -106,11 +119,17 @@ def __iter__(self):
         """
         Return "shuffled" indices.
         """
-        # Next fold.
-        self.current_fold += 1
+        # Decrease the number of epochs left for the current fold.
+        self.epochs_left = self.epochs_left - 1
+        if self.epochs_left <= 0:
+            # Next fold, modulo by the total number of folds.
+            self.current_fold = (self.current_fold + 1) % self.num_folds
 
-        # Regenerate indices.
-        self.indices = self.regenerate_indices()
+            # Regenerate indices.
+            self.indices = self.regenerate_indices()
+
+            # Reset the epochs counter.
+            self.epochs_left = self.epochs_per_fold
 
         # Return permutated indices.
         return (self.indices[i] for i in torch.randperm(len(self.indices)))
 
@@ -132,23 +151,30 @@ class kFoldWeightedRandomSampler(kFoldRandomSampler):
     Every time __iter__() method is called, it moves to next fold/set of folds.
     """
 
-    def __init__(self, weights, num_samples, num_folds, all_but_current_fold = True, replacement=True):
+    def __init__(self, weights, num_samples, num_folds, epochs_per_fold = 1, all_but_current_fold = True, replacement=True):
         """
         Initializes the sampler by generating the indices associated with the fold(s) that are to be used.
 
-        :param num_samples: Size of the dataset
+        :param num_samples: Size of the dataset
+
         :param num_folds: Number of folds
-        :param all_but_current_fold: Operation mode (DEFAULT: True):
-            When True, generates indices for all-but-one folds (for training). \
-            When False, generates indices for only one fold (for validation). \
+
+        :param epochs_per_fold: Number of epochs that need to pass before the sampler moves to the next fold(s) (DEFAULT: 1)
+
+        :param all_but_current_fold: Operation mode (DEFAULT: True): \
+            When True, generates indices for all-but-one folds (for training) \
+            When False, generates indices for only one fold (for validation)
+
         :params weights: a sequence of weights, not necessary summing up to one
+
         :param num_samples: number of samples to draw
+
         :param replacement: if ``True``, samples are drawn with replacement. If not, they are drawn without replacement, which means that when a sample index is drawn for a row, it cannot be drawn again for that row.
         """
         # Call k-fold base class constructor.
-        super().__init__(num_samples, num_folds, all_but_current_fold)
+        super().__init__(num_samples, num_folds, epochs_per_fold, all_but_current_fold)
 
         # Get replacement flag.
         if not isinstance(replacement, bool):
             raise ValueError("replacement should be a boolean value, but got "
@@ -159,12 +185,17 @@ def __init__(self, weights, num_samples, num_folds, all_but_current_fold = True,
         self.weights = torch.tensor(weights, dtype=torch.double)
 
     def __iter__(self):
-        # Next fold.
-        self.current_fold += 1
+        # Decrease the number of epochs left for the current fold.
+        self.epochs_left = self.epochs_left - 1
+        if self.epochs_left <= 0:
+            # Next fold, modulo by the total number of folds.
+            self.current_fold = (self.current_fold + 1) % self.num_folds
 
-        # Regenerate indices.
-        self.indices = self.regenerate_indices()
+            # Regenerate indices.
+            self.indices = self.regenerate_indices()
+            # Reset the epochs counter.
+            self.epochs_left = self.epochs_per_fold
 
         # Select the corresponging weights.
         weights = torch.take(self.weights, torch.tensor(self.indices))
diff --git a/tests/samplers_tests.py b/tests/samplers_tests.py
index a228ca0..53ca272 100644
--- a/tests/samplers_tests.py
+++ b/tests/samplers_tests.py
@@ -68,6 +68,33 @@ def test_kfold_random_sampler_current_fold(self):
             self.assertIn(ix, indices)
 
+    def test_kfold_random_sampler_current_fold_10epochs(self):
+        """ Tests the k-fold sampler current_fold mode with 10 epochs per fold. """
+
+        # Create the sampler.
+        sampler = kFoldRandomSampler(20, 3, 10, all_but_current_fold=False)
+
+        # First 10 epochs - the same indices from the 0-6 range.
+        for _ in range(10):
+            # Test zero-th fold.
+            indices = list(iter(sampler))
+            # Check number of samples.
+            self.assertEqual(len(indices), 7)
+            # Check presence of all indices.
+            for ix in range(0,7):
+                self.assertIn(ix, indices)
+
+        # Next 10 epochs - the same indices from the 7-13 range.
+        for _ in range(10):
+            # Test first fold.
+            indices = list(iter(sampler))
+            # Check number of samples.
+            self.assertEqual(len(indices), 7)
+            # Check presence of all indices.
+            for ix in range(7,14):
+                self.assertIn(ix, indices)
+
+
     def test_kfold_random_sampler_all_but_current_fold(self):
         """ Tests the k-fold sampler all_but_current_fold mode. """
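
For reference, here is a minimal, self-contained sketch (not part of the patch) of the fold-rotation behaviour that `epochs_per_fold` adds to `kFoldRandomSampler`: the sampler keeps serving the same fold for `epochs_per_fold` calls to `__iter__()` before rotating to the next one. The class name `MiniKFoldSampler` and its exact index arithmetic are illustrative assumptions, not the library implementation; only `torch.randperm` is taken from PyTorch.

```python
import torch
from math import ceil


class MiniKFoldSampler:
    """Sketch: stays on the current fold for `epochs_per_fold` calls of __iter__(), then advances."""

    def __init__(self, num_samples, num_folds, epochs_per_fold=1, all_but_current_fold=True):
        self.num_samples = num_samples
        self.num_folds = num_folds
        self.epochs_per_fold = epochs_per_fold
        self.all_but_current_fold = all_but_current_fold
        self.current_fold = 0
        # +1 accounts for the initial generation of indices below.
        self.epochs_left = epochs_per_fold + 1
        self.indices = self.regenerate_indices()

    def regenerate_indices(self):
        # Indices belonging to the current fold.
        fold_size = ceil(self.num_samples / self.num_folds)
        start = self.current_fold * fold_size
        stop = min(start + fold_size, self.num_samples)
        fold_indices = set(range(start, stop))
        if self.all_but_current_fold:
            # Training mode: everything except the current fold.
            return [i for i in range(self.num_samples) if i not in fold_indices]
        # Validation mode: only the current fold.
        return sorted(fold_indices)

    def __iter__(self):
        self.epochs_left -= 1
        if self.epochs_left <= 0:
            # Move to the next fold and restart the epoch counter.
            self.current_fold = (self.current_fold + 1) % self.num_folds
            self.indices = self.regenerate_indices()
            self.epochs_left = self.epochs_per_fold
        # Yield the fold's indices in random order.
        return (self.indices[i] for i in torch.randperm(len(self.indices)))

    def __len__(self):
        return len(self.indices)


if __name__ == '__main__':
    # 20 samples, 3 folds, 2 epochs per fold, validation mode (single fold).
    sampler = MiniKFoldSampler(20, 3, epochs_per_fold=2, all_but_current_fold=False)
    for epoch in range(6):
        # Folds rotate every 2 epochs: [0..6], [0..6], [7..13], [7..13], [14..19], [14..19]
        print(epoch, sorted(iter(sampler)))
```

With the values used in default_vqa_med_2019.yml above (`folds: 10`, `epochs_per_fold: 20`), the training sampler would therefore keep the same all-but-one-fold split for 20 consecutive epochs before rotating, while validation imports the same `shuffled_indices.npy` file so that both subsets see a consistent sample ordering.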