Skip to content
This repository was archived by the owner on Jul 18, 2024. It is now read-only.

Commit a251630

Browse files
authored
Merge pull request #20 from IBM/c4_pipelines
vqa_med: question augmentations: random word shuffling and random word removal
2 parents 7c97fe2 + c9327d2 commit a251630

File tree

3 files changed

+141
-7
lines changed

3 files changed

+141
-7
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml

# Training parameters:
training:
  problem:
    batch_size: 128

# Validation parameters:
validation:
  problem:
    batch_size: 128

pipeline:
  name: c2_word_answer_onehot_bow

  # Answer encoding: tokenize the raw answer strings into word lists.
  answer_tokenizer:
    type: SentenceTokenizer
    priority: 1.1
    preprocessing: lowercase,remove_punctuation
    # NOTE(review): the original file contained mis-encoded "smart" quotes
    # ([“,”,’]); straight quotes are required for valid YAML strings.
    remove_characters: [",","'"]
    streams:
      inputs: answers
      outputs: tokenized_answer_words

  # Map each answer word to a one-hot index using a precomputed vocabulary.
  answer_onehot_encoder:
    type: SentenceOneHotEncoder
    priority: 1.2
    data_folder: ~/data/vqa-med
    word_mappings_file: answer_words.c2.preprocessed.word.mappings.csv
    export_word_mappings_to_globals: True
    streams:
      inputs: tokenized_answer_words
      outputs: encoded_answer_words
    globals:
      vocabulary_size: answer_words_vocabulary_size
      word_mappings: answer_words_word_mappings

  # Aggregate the encoded words into a bag-of-words vector.
  answer_bow_encoder:
    type: BOWEncoder
    priority: 1.3
    streams:
      inputs: encoded_answer_words
      outputs: bow_answer_words
    globals:
      bow_size: answer_words_vocabulary_size

  # Model.
  classifier:
    type: FeedForwardNetwork
    hidden_sizes: [500, 500]
    dropout_rate: 0.5
    priority: 3
    streams:
      inputs: bow_answer_words
    globals:
      input_size: answer_words_vocabulary_size
      prediction_size: vocabulary_size_c2

  # Viewers.
  viewer:
    type: StreamViewer
    priority: 100.4
    input_streams: answers, tokenized_answer_words, predicted_answers

#: pipeline

configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,11 @@ default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml
55
training:
66
problem:
77
batch_size: 128
8-
remove_punctuation: all
98

109
# Validation parameters:
1110
validation:
1211
problem:
1312
batch_size: 128
14-
remove_punctuation: all
1513

1614
pipeline:
1715
name: c4_word_answer_onehot_bow
@@ -51,13 +49,19 @@ pipeline:
5149
# Model.
5250
classifier:
5351
type: FeedForwardNetwork
54-
hidden_sizes: [500]
52+
hidden_sizes: [500, 500]
5553
dropout_rate: 0.5
5654
priority: 3
5755
streams:
5856
inputs: bow_answer_words
5957
globals:
6058
input_size: answer_words_vocabulary_size
6159
prediction_size: vocabulary_size_c4
62-
60+
61+
# Viewers.
62+
viewer:
63+
type: StreamViewer
64+
priority: 100.4
65+
input_streams: answers, tokenized_answer_words, predicted_answers
66+
6367
#: pipeline

ptp/components/problems/image_text_to_class/vqa_med_2019.py

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@
2020
import os
2121
import string
2222
import tqdm
23+
2324
import pandas as pd
2425
from PIL import Image
25-
26+
import numpy as np
2627
import nltk
2728

2829
import torch
@@ -307,6 +308,62 @@ def preprocess_text(self, text, lowercase = False, remove_punctuation = False, t
307308
# Return cleaned text.
308309
return cleansed_words
309310

311+
def random_remove_stop_words(self, words):
    """
    Randomly removes stop words from tokenized text, each stop word being
    dropped independently with probability 0.5.

    :param words: tokenized text (list of words).
    :return: resulting tokenized text (list of words).
    """
    # English stop-word set. NOTE(review): rebuilt on every call — could be
    # cached at module/class level if this shows up in profiling.
    stops = set(nltk.corpus.stopwords.words("english"))

    # Flag which positions hold stop words.
    stop_words = [word in stops for word in words]

    # No stop words present: return the input unchanged. (Fixes the original
    # code path in which `result` was only built inside the
    # `if sum(stop_words) > 0:` branch.)
    if not any(stop_words):
        return words

    # One independent coin flip (p=0.5) per token.
    remove_probs = np.random.binomial(1, 0.5, len(words))

    result = []
    for word, is_stop, rem_prob in zip(words, stop_words, remove_probs):
        if is_stop and rem_prob:
            # Drop this stop word.
            continue
        # Else: keep the word.
        result.append(word)

    return result
338+
339+
340+
def random_shuffle_words(self, words):
341+
"""
342+
Function randomly shuffles, with probability of 0.5, two consecutive words in text.
343+
344+
:param words: tokenized text
345+
:return: resulting tokenized text.
346+
"""
347+
# Do not shuffle if there are less than 2 words.
348+
if len(words) < 2:
349+
return words
350+
# Shuffle with probability of 0.5.
351+
if np.random.binomial(1, 0.5, 1):
352+
return words
353+
354+
# Find words to shuffle - random without replacement.
355+
shuffled_i = np.random.choice(len(words)-1, )
356+
indices = [i for i in range(len(words))]
357+
indices[shuffled_i] = shuffled_i+1
358+
indices[shuffled_i+1] = shuffled_i
359+
#print(indices)
360+
361+
# Create resulting table.
362+
result = [words[indices[i]] for i in range(len(words))]
363+
364+
return result
365+
366+
310367
def load_dataset(self, source_files, source_categories):
311368
"""
312369
Loads the dataset from one or more files.
@@ -368,7 +425,6 @@ def load_dataset(self, source_files, source_categories):
368425
return dataset
369426

370427

371-
372428
def __getitem__(self, index):
373429
"""
374430
Getter method to access the dataset and return a single sample.
@@ -424,7 +480,14 @@ def __getitem__(self, index):
424480

425481
# Apply question transformations.
426482
preprocessed_question = item[self.key_questions]
427-
# TODO: apply additional random transformations e.g. "shuffle_words"
483+
if 'tokenize' in self.question_preprocessing:
484+
# Apply them only if text is tokenized.
485+
if 'random_remove_stop_words' in self.question_preprocessing:
486+
preprocessed_question = self.random_remove_stop_words(preprocessed_question)
487+
488+
if 'random_shuffle_words' in self.question_preprocessing:
489+
preprocessed_question = self.random_shuffle_words(preprocessed_question)
490+
# Return question.
428491
data_dict[self.key_questions] = preprocessed_question
429492

430493
# Return answer.

0 commit comments

Comments
 (0)