diff --git a/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml b/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
index 3846b45..715cb17 100644
--- a/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
+++ b/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
@@ -15,6 +15,14 @@ split: training
 # Options: all | c1 | c2 | c3 | c4 (or any combination of the latter 4)
 categories: all
 
+# Flag indicating whether the problem will load and return images (LOADED)
+stream_images: True
+
+# Flag indicating whether images will be preloaded (i.e. loaded once at start) (LOADED)
+# WARNING: if this option is active, the images will also be "preprocessed" at start.
+# This means that preloading should not be used when random augmentations are needed!
+preload_images: False
+
 # Resize parameter (LOADED)
 # When present, resizes the images from original size to [height, width]
 # Depth remains set to 3.
diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml
index 5ee97e7..17168c2 100644
--- a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml
+++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml
@@ -1,9 +1,28 @@
 # Load config defining problems for training, validation and testing.
 default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml
 
-pipeline:
-  name: c4_word_answer_onehot_bow
+# Training parameters:
+training:
+  problem:
+    categories: C4
+    batch_size: 512
+    # We won't use images at all here.
+    stream_images: False
+  dataloader:
+    num_workers: 0
+
+# Validation parameters:
+validation:
+  problem:
+    categories: C4
+    batch_size: 512
+    # We won't use images at all here.
+    stream_images: False
+  dataloader:
+    num_workers: 0
+
+pipeline:
 
   # Answer encoding.
   answer_tokenizer:
     type: SentenceTokenizer
diff --git a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml
index b3b7576..958161a 100644
--- a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml
+++ b/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml
@@ -38,7 +38,9 @@ hyperparameters:
   # Final classifier: FFN.
   answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [83]
 
-  batch_size: &batch_size 64
+  batch_size: &batch_size 200
+  preload_images: &preload_images True
+  num_workers: &num_workers 0
 
 # Training parameters:
 training:
@@ -49,10 +51,15 @@ training:
     # Appy all preprocessing/data augmentations.
     question_preprocessing: *question_preprocessing
     image_preprocessing: *image_preprocessing
+    # Preload images.
+    preload_images: *preload_images
     streams:
       questions: tokenized_questions
   sampler:
     weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv
+  # Number of dataloader workers used for loading images.
+  dataloader:
+    num_workers: *num_workers
 
   # Optimizer parameters:
   optimizer:
@@ -67,14 +74,19 @@ training:
 
 # Validation parameters:
 validation:
+  partial_validation_interval: 100
   problem:
     batch_size: *batch_size
     categories: C1,C2,C3
     # Appy all preprocessing/data augmentations.
     question_preprocessing: *question_preprocessing
     image_preprocessing: *image_preprocessing
+    # Preload images.
+    preload_images: *preload_images
     streams:
       questions: tokenized_questions
+  dataloader:
+    num_workers: *num_workers
 
 pipeline:
 
diff --git a/ptp/components/problems/image_text_to_class/vqa_med_2019.py b/ptp/components/problems/image_text_to_class/vqa_med_2019.py
index e156eba..8e94418 100644
--- a/ptp/components/problems/image_text_to_class/vqa_med_2019.py
+++ b/ptp/components/problems/image_text_to_class/vqa_med_2019.py
@@ -85,6 +85,12 @@ def __init__(self, name, config):
         self.key_category_names = self.stream_keys["category_names"]
         self.key_image_sizes = self.stream_keys["image_sizes"]
 
+        # Get flag indicating whether we want to stream images or not.
+        self.stream_images = self.config['stream_images']
+
+        # Get flag indicating whether we want to (pre)load all images at the start.
+        self.preload_images = self.config['preload_images']
+
         # Check the desired image size.
         if len(self.config['resize_image']) != 2:
             self.logger.error("'resize_image' field must contain 2 values: the desired height and width")
@@ -275,12 +281,15 @@ def output_data_definitions(self):
         # Add all "standard" streams.
         d = {
             self.key_indices: DataDefinition([-1, 1], [list, int], "Batch of sample indices [BATCH_SIZE] x [1]"),
-            self.key_images: DataDefinition([-1, self.depth, self.height, self.width], [torch.Tensor], "Batch of images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE_WIDTH]"),
             self.key_image_ids: DataDefinition([-1, 1], [list, str], "Batch of image names, each being a single word [BATCH_SIZE] x [STRING]"),
-            self.key_image_sizes: DataDefinition([-1, 2], [torch.Tensor], "Batch of original sizes (height, width) of images [BATCH_SIZE x 2]"),
             self.key_category_ids: DataDefinition([-1], [torch.Tensor], "Batch of target category indices, each being a single index [BATCH_SIZE]"),
             self.key_category_names: DataDefinition([-1, 1], [list, str], "Batch of category target names, each being a single word [BATCH_SIZE] x [STRING]"),
             }
+
+        # Return images only when required.
+        if self.stream_images:
+            d[self.key_images] = DataDefinition([-1, self.depth, self.height, self.width], [torch.Tensor], "Batch of images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE_WIDTH]")
+            d[self.key_image_sizes] = DataDefinition([-1, 2], [torch.Tensor], "Batch of original sizes (height, width) of images [BATCH_SIZE x 2]")
 
         # Add stream with questions.
         if 'tokenize' in self.question_preprocessing:
@@ -541,8 +550,8 @@ def load_dataset(self, source_files, source_image_folders, source_categories):
                     False
                     )
 
-                # Add record to dataset.
-                dataset.append({
+                # Create item "dictionary".
+                item = {
                     # Image name and path leading to it.
                     self.key_image_ids: row[self.key_image_ids],
                     "image_folder": image_folder,
@@ -550,7 +559,16 @@
                     self.key_answers: preprocessed_answer,
                     # Add category.
                     self.key_category_ids: category
-                    })
+                    }
+
+                # Preload image.
+                if self.preload_images and self.stream_images:
+                    img, img_size = self.get_image(row[self.key_image_ids], image_folder)
+                    item[self.key_images] = img
+                    item[self.key_image_sizes] = img_size
+
+                # Add item to dataset.
+                dataset.append(item)
 
                 t.update()
             t.close()
@@ -601,8 +619,8 @@ def load_testset(self, data_file, image_folder):
                 else:
                     preprocessed_answer = answer
 
-                # Add record to dataset.
-                dataset.append({
+                # Create item "dictionary".
+                item = {
                     # Image name and path leading to it.
                     self.key_image_ids: row[self.key_image_ids],
                     "image_folder": image_folder,
@@ -610,7 +628,16 @@
                     self.key_answers: preprocessed_answer,
                     # Add category.
                     self.key_category_ids: category_id
-                    })
+                    }
+
+                # Preload image.
+                if self.preload_images and self.stream_images:
+                    img, img_size = self.get_image(row[self.key_image_ids], image_folder)
+                    item[self.key_images] = img
+                    item[self.key_image_sizes] = img_size
+
+                # Add item to dataset.
+                dataset.append(item)
 
                 t.update()
             t.close()
@@ -619,22 +646,17 @@
         # Return the created list.
         return dataset
 
-
-    def __getitem__(self, index):
+    def get_image(self, img_id, img_folder):
         """
-        Getter method to access the dataset and return a single sample.
+        Function loads and returns an image along with its size.
+        Additionally, it performs all the required transformations.
 
-        :param index: index of the sample to return.
-        :type index: int
+        :param img_id: Identifier of the image.
+        :param img_folder: Path to the folder containing the image.
 
-        :return: DataDict({'indices', 'images', 'images_ids','questions', 'answers', 'category_ids', 'image_sizes'})
+        :return: image (Tensor), image size (Tensor with (height, width), both scaled to (0,1])
         """
-        # Get item.
-        item = self.dataset[index]
 
-        # Load the adequate image.
-        img_id = item[self.key_image_ids]
-        img_folder = item["image_folder"]
         extension = '.jpg'
         # Load the image.
         img = Image.open(os.path.join(img_folder, img_id + extension))
@@ -665,14 +687,47 @@
         # Apply transformations.
         img = transforms_com(img)
 
+        # Get scaled image size.
+        img_size = torch.FloatTensor([float(height/self.scale_image_height), float(width/self.scale_image_width)])
+
+        # Return image and size.
+        return img, img_size
+
+    def __getitem__(self, index):
+        """
+        Getter method to access the dataset and return a single sample.
+
+        :param index: index of the sample to return.
+        :type index: int
+
+        :return: DataDict({'indices', 'images', 'image_ids', 'questions', 'answers', 'category_ids', 'image_sizes'})
+        """
+        # Get item.
+        item = self.dataset[index]
+
         # Create the resulting sample (data dict).
         data_dict = self.create_data_dict(index)
 
-        # Image related variables.
-        data_dict[self.key_images] = img
+        # Get and stream the image id.
+        img_id = item[self.key_image_ids]
         data_dict[self.key_image_ids] = img_id
-        # Scale width and height to range (0,1).
-        data_dict[self.key_image_sizes] = torch.FloatTensor([float(height/self.scale_image_height), float(width/self.scale_image_width)])
+
+        # Load the adequate image - only when required.
+        if self.stream_images:
+
+            if self.preload_images:
+                # Use preloaded values.
+                img = item[self.key_images]
+                img_size = item[self.key_image_sizes]
+            else:
+                # Load the image on the fly.
+                img, img_size = self.get_image(img_id, item["image_folder"])
+
+            # Image related variables.
+            data_dict[self.key_images] = img
+
+            # Scale width and height to range (0,1).
+            data_dict[self.key_image_sizes] = img_size
 
         # Apply question transformations.
         preprocessed_question = item[self.key_questions]
@@ -728,9 +783,10 @@ def collate_fn(self, batch):
         data_dict = self.create_data_dict([sample[self.key_indices] for sample in batch])
 
         # Stack images.
-        data_dict[self.key_images] = torch.stack([item[self.key_images] for item in batch]).type(torch.FloatTensor)
         data_dict[self.key_image_ids] = [item[self.key_image_ids] for item in batch]
-        data_dict[self.key_image_sizes] = torch.stack([item[self.key_image_sizes] for item in batch]).type(torch.FloatTensor)
+        if self.stream_images:
+            data_dict[self.key_images] = torch.stack([item[self.key_images] for item in batch]).type(torch.FloatTensor)
+            data_dict[self.key_image_sizes] = torch.stack([item[self.key_image_sizes] for item in batch]).type(torch.FloatTensor)
 
         # Collate lists/lists of lists.
         data_dict[self.key_questions] = [item[self.key_questions] for item in batch]
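
Usage note (illustrative, not part of the patch): the two new flags are independent, so a problem section can stream images and load them lazily per sample (the default), preload and preprocess all of them once at start, or skip images entirely for text-only pipelines. A minimal sketch combining the flags, assuming the defaults introduced above:

    training:
      problem:
        # Stream images, but load (and preprocess) all of them once at start.
        # Remember: preloading bypasses random augmentations.
        stream_images: True
        preload_images: True
      dataloader:
        # With preloaded images extra loader workers are unnecessary.
        num_workers: 0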