From 45624d4d6a92d5d30281df2d04e028cffa1c8179 Mon Sep 17 00:00:00 2001
From: Tomasz Kornuta
Date: Sat, 4 May 2019 11:03:02 -0700
Subject: [PATCH 1/2] Added stream_images flag to VQAMED problem, enabling streaming of images and image_sizes to be turned on/off (default: on)

---
 .../image_text_to_class/vqa_med_2019.yml     |  3 +
 .../c4_word_answer_onehot_bow.yml            | 23 ++++-
 .../image_text_to_class/vqa_med_2019.py      | 93 +++++++++++--------
 3 files changed, 76 insertions(+), 43 deletions(-)

diff --git a/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml b/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
index 3846b45..b29f344 100644
--- a/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
+++ b/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
@@ -15,6 +15,9 @@ split: training
 # Options: all | c1 | c2 | c3 | c4 (or any combination of the latter 4)
 categories: all
 
+# Flag indicating whether the problem will load and return images (LOADED)
+stream_images: True
+
 # Resize parameter (LOADED)
 # When present, resizes the images from original size to [height, width]
 # Depth remains set to 3.
diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml
index 5ee97e7..17168c2 100644
--- a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml
+++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml
@@ -1,9 +1,28 @@
 # Load config defining problems for training, validation and testing.
 default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml
 
-pipeline:
-  name: c4_word_answer_onehot_bow
+# Training parameters:
+training:
+  problem:
+    categories: C4
+    batch_size: 512
+    # We won't use images here at all.
+    stream_images: False
+  dataloader:
+    num_workers: 0
 
+# Validation parameters:
+validation:
+  problem:
+    categories: C4
+    batch_size: 512
+    # We won't use images here at all.
+    stream_images: False
+  dataloader:
+    num_workers: 0
+
+pipeline:
   # Answer encoding.
   answer_tokenizer:
     type: SentenceTokenizer
diff --git a/ptp/components/problems/image_text_to_class/vqa_med_2019.py b/ptp/components/problems/image_text_to_class/vqa_med_2019.py
index e156eba..5d4d5a5 100644
--- a/ptp/components/problems/image_text_to_class/vqa_med_2019.py
+++ b/ptp/components/problems/image_text_to_class/vqa_med_2019.py
@@ -85,6 +85,9 @@ def __init__(self, name, config):
         self.key_category_names = self.stream_keys["category_names"]
         self.key_image_sizes = self.stream_keys["image_sizes"]
 
+        # Get flag informing whether we want to stream images or not.
+        self.stream_images = self.config['stream_images']
+
         # Check the desired image size.
         if len(self.config['resize_image']) != 2:
             self.logger.error("'resize_image' field must contain 2 values: the desired height and width")
@@ -275,12 +278,15 @@ def output_data_definitions(self):
 
         # Add all "standard" streams.
         d = {
             self.key_indices: DataDefinition([-1, 1], [list, int], "Batch of sample indices [BATCH_SIZE] x [1]"),
-            self.key_images: DataDefinition([-1, self.depth, self.height, self.width], [torch.Tensor], "Batch of images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE_WIDTH]"),
             self.key_image_ids: DataDefinition([-1, 1], [list, str], "Batch of image names, each being a single word [BATCH_SIZE] x [STRING]"),
-            self.key_image_sizes: DataDefinition([-1, 2], [torch.Tensor], "Batch of original sizes (height, width) of images [BATCH_SIZE x 2]"),
             self.key_category_ids: DataDefinition([-1], [torch.Tensor], "Batch of target category indices, each being a single index [BATCH_SIZE]"),
             self.key_category_names: DataDefinition([-1, 1], [list, str], "Batch of category target names, each being a single word [BATCH_SIZE] x [STRING]"),
             }
+
+        # Return images only when required.
+        if self.stream_images:
+            d[self.key_images] = DataDefinition([-1, self.depth, self.height, self.width], [torch.Tensor], "Batch of images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE_WIDTH]")
+            d[self.key_image_sizes] = DataDefinition([-1, 2], [torch.Tensor], "Batch of original sizes (height, width) of images [BATCH_SIZE x 2]")
 
         # Add stream with questions.
         if 'tokenize' in self.question_preprocessing:
@@ -632,47 +638,51 @@ def __getitem__(self, index):
         # Get item.
         item = self.dataset[index]
 
-        # Load the adequate image.
-        img_id = item[self.key_image_ids]
-        img_folder = item["image_folder"]
-        extension = '.jpg'
-        # Load the image.
-        img = Image.open(os.path.join(img_folder, img_id + extension))
-        # Get its width and height.
-        width, height = img.size
-
-        image_transformations_list = []
-        # Optional.
-        if 'random_affine' in self.image_preprocessing:
-            rotate = (-45, 80)
-            translate = (0.05, 0.25)
-            scale = (0.5, 2)
-            image_transformations_list.append(transforms.RandomAffine(rotate, translate, scale))
-        if 'random_horizontal_flip' in self.image_preprocessing:
-            image_transformations_list.append(transforms.RandomHorizontalFlip())
-
-        # Add two obligatory transformations.
-        image_transformations_list.append(transforms.Resize([self.height,self.width]))
-        image_transformations_list.append(transforms.ToTensor())
-
-        # Optional normalization.
-        if 'normalize' in self.image_preprocessing:
-            # Use normalization that the pretrained models from TorchVision require.
-            image_transformations_list.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
-
-        # Resize the image and transform to Torch Tensor.
-        transforms_com = transforms.Compose(image_transformations_list)
-        # Apply transformations.
-        img = transforms_com(img)
-
         # Create the resulting sample (data dict).
         data_dict = self.create_data_dict(index)
 
-        # Image related variables.
-        data_dict[self.key_images] = img
+        # Load and stream the image id.
+        img_id = item[self.key_image_ids]
         data_dict[self.key_image_ids] = img_id
-        # Scale width and height to range (0,1).
-        data_dict[self.key_image_sizes] = torch.FloatTensor([float(height/self.scale_image_height), float(width/self.scale_image_width)])
+
+        # Load the adequate image - only when required.
+        if self.stream_images:
+            img_folder = item["image_folder"]
+            extension = '.jpg'
+            # Load the image.
+            img = Image.open(os.path.join(img_folder, img_id + extension))
+            # Get its width and height.
+            width, height = img.size
+
+            image_transformations_list = []
+            # Optional.
+            if 'random_affine' in self.image_preprocessing:
+                rotate = (-45, 80)
+                translate = (0.05, 0.25)
+                scale = (0.5, 2)
+                image_transformations_list.append(transforms.RandomAffine(rotate, translate, scale))
+            if 'random_horizontal_flip' in self.image_preprocessing:
+                image_transformations_list.append(transforms.RandomHorizontalFlip())
+
+            # Add two obligatory transformations.
+            image_transformations_list.append(transforms.Resize([self.height,self.width]))
+            image_transformations_list.append(transforms.ToTensor())
+
+            # Optional normalization.
+            if 'normalize' in self.image_preprocessing:
+                # Use normalization that the pretrained models from TorchVision require.
+                image_transformations_list.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
+
+            # Resize the image and transform to Torch Tensor.
+            transforms_com = transforms.Compose(image_transformations_list)
+            # Apply transformations.
+            img = transforms_com(img)
+
+            # Image related variables.
+            data_dict[self.key_images] = img
+
+            # Scale width and height to range (0,1).
+            data_dict[self.key_image_sizes] = torch.FloatTensor([float(height/self.scale_image_height), float(width/self.scale_image_width)])
 
         # Apply question transformations.
         preprocessed_question = item[self.key_questions]
@@ -728,9 +738,10 @@ def collate_fn(self, batch):
         data_dict = self.create_data_dict([sample[self.key_indices] for sample in batch])
 
         # Stack images.
-        data_dict[self.key_images] = torch.stack([item[self.key_images] for item in batch]).type(torch.FloatTensor)
         data_dict[self.key_image_ids] = [item[self.key_image_ids] for item in batch]
-        data_dict[self.key_image_sizes] = torch.stack([item[self.key_image_sizes] for item in batch]).type(torch.FloatTensor)
+        if self.stream_images:
+            data_dict[self.key_images] = torch.stack([item[self.key_images] for item in batch]).type(torch.FloatTensor)
+            data_dict[self.key_image_sizes] = torch.stack([item[self.key_image_sizes] for item in batch]).type(torch.FloatTensor)
 
         # Collate lists/lists of lists.
         data_dict[self.key_questions] = [item[self.key_questions] for item in batch]

From 9f9196fca87c8bf660a0876d0d0c6a06143610a5 Mon Sep 17 00:00:00 2001
From: Tomasz Kornuta
Date: Sat, 4 May 2019 12:24:22 -0700
Subject: [PATCH 2/2] Added option to (pre)load the whole dataset (mostly images) into memory during problem initialization

---
 .../image_text_to_class/vqa_med_2019.yml      |   5 +
 ...ic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml |  14 +-
 .../image_text_to_class/vqa_med_2019.py       | 121 ++++++++++++------
 3 files changed, 101 insertions(+), 39 deletions(-)

diff --git a/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml b/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
index b29f344..715cb17 100644
--- a/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
+++ b/configs/default/components/problems/image_text_to_class/vqa_med_2019.yml
@@ -18,6 +18,11 @@ categories: all
 # Flag indicating whether the problem will load and return images (LOADED)
 stream_images: True
 
+# Flag indicating whether images will be preloaded (i.e. loaded once at start) (LOADED)
+# WARNING: if this option is active, the images will also be "preprocessed" at start.
+# This means that preloading should not be used when one needs to use random augmentations!
+preload_images: False
+
 # Resize parameter (LOADED)
 # When present, resizes the images from original size to [height, width]
 # Depth remains set to 3.
diff --git a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml b/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml
index b3b7576..958161a 100644
--- a/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml
+++ b/configs/vqa_med_2019/evaluation/mimic_lstm_vgg16_ewm_is_cat_ffn_c123_loss.yml
@@ -38,7 +38,9 @@ hyperparameters:
   # Final classifier: FFN.
   answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [83]
 
-  batch_size: &batch_size 64
+  batch_size: &batch_size 200
+  preload_images: &preload_images True
+  num_workers: &num_workers 0
 
 # Training parameters:
 training:
@@ -49,10 +51,15 @@ training:
     # Apply all preprocessing/data augmentations.
     question_preprocessing: *question_preprocessing
     image_preprocessing: *image_preprocessing
+    # Preload images.
+    preload_images: *preload_images
     streams:
       questions: tokenized_questions
   sampler:
     weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv
+  # Use the indicated number of workers for loading images.
+  dataloader:
+    num_workers: *num_workers
 
   # Optimizer parameters:
   optimizer:
@@ -67,14 +74,19 @@ training:
 
 # Validation parameters:
 validation:
+  partial_validation_interval: 100
   problem:
     batch_size: *batch_size
     categories: C1,C2,C3
     # Apply all preprocessing/data augmentations.
     question_preprocessing: *question_preprocessing
     image_preprocessing: *image_preprocessing
+    # Preload images.
+    preload_images: *preload_images
     streams:
       questions: tokenized_questions
+  dataloader:
+    num_workers: *num_workers
 
 pipeline:
 
diff --git a/ptp/components/problems/image_text_to_class/vqa_med_2019.py b/ptp/components/problems/image_text_to_class/vqa_med_2019.py
index 5d4d5a5..8e94418 100644
--- a/ptp/components/problems/image_text_to_class/vqa_med_2019.py
+++ b/ptp/components/problems/image_text_to_class/vqa_med_2019.py
@@ -87,7 +87,10 @@ def __init__(self, name, config):
 
         # Get flag informing whether we want to stream images or not.
         self.stream_images = self.config['stream_images']
-
+
+        # Get flag indicating whether we want to (pre)load all images at the start.
+        self.preload_images = self.config['preload_images']
+
         # Check the desired image size.
         if len(self.config['resize_image']) != 2:
             self.logger.error("'resize_image' field must contain 2 values: the desired height and width")
@@ -547,8 +550,8 @@ def load_dataset(self, source_files, source_image_folders, source_categories):
                         False )
 
-                # Add record to dataset.
-                dataset.append({
+                # Create item "dictionary".
+                item = {
                     # Image name and path leading to it.
                     self.key_image_ids: row[self.key_image_ids],
                     "image_folder": image_folder,
                     ...
                     self.key_answers: preprocessed_answer,
                     # Add category.
                     self.key_category_ids: category
-                    })
+                    }
+
+                # Preload image.
+                if self.preload_images and self.stream_images:
+                    img, img_size = self.get_image(row[self.key_image_ids], image_folder)
+                    item[self.key_images] = img
+                    item[self.key_image_sizes] = img_size
+
+                # Add item to dataset.
+                dataset.append(item)
 
                 t.update()
             t.close()
@@ -607,8 +619,8 @@ def load_testset(self, data_file, image_folder):
                 else:
                     preprocessed_answer = answer
 
-                # Add record to dataset.
-                dataset.append({
+                # Create item "dictionary".
+                item = {
                     # Image name and path leading to it.
                     self.key_image_ids: row[self.key_image_ids],
                     "image_folder": image_folder,
                     ...
                     self.key_answers: preprocessed_answer,
                     # Add category.
                     self.key_category_ids: category_id
-                    })
+                    }
+
+                # Preload image.
+                if self.preload_images and self.stream_images:
+                    img, img_size = self.get_image(row[self.key_image_ids], image_folder)
+                    item[self.key_images] = img
+                    item[self.key_image_sizes] = img_size
+
+                # Add item to dataset.
+                dataset.append(item)
 
                 t.update()
             t.close()
@@ -625,6 +646,52 @@ def load_testset(self, data_file, image_folder):
 
         # Return the created list.
         return dataset
 
+    def get_image(self, img_id, img_folder):
+        """
+        Function loads and returns the image along with its size.
+        Additionally, it performs all the required transformations.
+
+        :param img_id: Identifier of the image.
+        :param img_folder: Path to the folder containing the image.
+
+        :return: image (Tensor), image size (Tensor, (height, width), both scaled to (0,1])
+        """
+
+        extension = '.jpg'
+        # Load the image.
+        img = Image.open(os.path.join(img_folder, img_id + extension))
+        # Get its width and height.
+        width, height = img.size
+
+        image_transformations_list = []
+        # Optional.
+        if 'random_affine' in self.image_preprocessing:
+            rotate = (-45, 80)
+            translate = (0.05, 0.25)
+            scale = (0.5, 2)
+            image_transformations_list.append(transforms.RandomAffine(rotate, translate, scale))
+        if 'random_horizontal_flip' in self.image_preprocessing:
+            image_transformations_list.append(transforms.RandomHorizontalFlip())
+
+        # Add two obligatory transformations.
+        image_transformations_list.append(transforms.Resize([self.height,self.width]))
+        image_transformations_list.append(transforms.ToTensor())
+
+        # Optional normalization.
+        if 'normalize' in self.image_preprocessing:
+            # Use normalization that the pretrained models from TorchVision require.
+            image_transformations_list.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
+
+        # Resize the image and transform to Torch Tensor.
+        transforms_com = transforms.Compose(image_transformations_list)
+        # Apply transformations.
+        img = transforms_com(img)
+
+        # Get scaled image size.
+        img_size = torch.FloatTensor([float(height/self.scale_image_height), float(width/self.scale_image_width)])
+
+        # Return image and size.
+        return img, img_size
 
     def __getitem__(self, index):
         """
@@ -647,42 +714,20 @@ def __getitem__(self, index):
 
         # Load the adequate image - only when required.
         if self.stream_images:
-            img_folder = item["image_folder"]
-            extension = '.jpg'
-            # Load the image.
-            img = Image.open(os.path.join(img_folder, img_id + extension))
-            # Get its width and height.
-            width, height = img.size
-
-            image_transformations_list = []
-            # Optional.
-            if 'random_affine' in self.image_preprocessing:
-                rotate = (-45, 80)
-                translate = (0.05, 0.25)
-                scale = (0.5, 2)
-                image_transformations_list.append(transforms.RandomAffine(rotate, translate, scale))
-            if 'random_horizontal_flip' in self.image_preprocessing:
-                image_transformations_list.append(transforms.RandomHorizontalFlip())
-
-            # Add two obligatory transformations.
-            image_transformations_list.append(transforms.Resize([self.height,self.width]))
-            image_transformations_list.append(transforms.ToTensor())
-
-            # Optional normalization.
-            if 'normalize' in self.image_preprocessing:
-                # Use normalization that the pretrained models from TorchVision require.
-                image_transformations_list.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
-
-            # Resize the image and transform to Torch Tensor.
-            transforms_com = transforms.Compose(image_transformations_list)
-            # Apply transformations.
-            img = transforms_com(img)
+
+            if self.preload_images:
+                # Use preloaded values.
+                img = item[self.key_images]
+                img_size = item[self.key_image_sizes]
+            else:
+                # Load the image on the fly.
+                img, img_size = self.get_image(img_id, item["image_folder"])
 
             # Image related variables.
             data_dict[self.key_images] = img
 
             # Scale width and height to range (0,1).
-            data_dict[self.key_image_sizes] = torch.FloatTensor([float(height/self.scale_image_height), float(width/self.scale_image_width)])
+            data_dict[self.key_image_sizes] = img_size
 
         # Apply question transformations.
         preprocessed_question = item[self.key_questions]
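Taken together, the two flags cover three problem configurations: images streamed and loaded lazily per sample (the default), images streamed but preloaded and preprocessed once during problem initialization, and images switched off entirely for text-only pipelines. Note that preload_images only takes effect when stream_images is enabled, since the preloading branch checks both flags. A minimal sketch of a problem section combining them, following the config layout used above (the batch size is illustrative only):

# Training parameters:
training:
  problem:
    categories: C1,C2,C3
    batch_size: 64          # illustrative value only
    # Keep streaming images, but load and preprocess them once at initialization.
    # Do not combine preloading with random augmentations
    # (random_affine, random_horizontal_flip), as they would be applied only once.
    stream_images: True
    preload_images: True
  dataloader:
    num_workers: 0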