From 566bef8110fa16b718037bdac1775487f38b5a01 Mon Sep 17 00:00:00 2001 From: tkornut Date: Fri, 26 Apr 2019 09:32:22 -0700 Subject: [PATCH 1/3] c4 answer words bow --- .../c4_classification/c4_word_answer_onehot_bow.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml index 842a987..1db17e6 100644 --- a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml +++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml @@ -5,13 +5,11 @@ default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml training: problem: batch_size: 128 - remove_punctuation: all # Validation parameters: validation: problem: batch_size: 128 - remove_punctuation: all pipeline: name: c4_word_answer_onehot_bow @@ -51,7 +49,7 @@ pipeline: # Model. classifier: type: FeedForwardNetwork - hidden_sizes: [500] + hidden_sizes: [500, 500] dropout_rate: 0.5 priority: 3 streams: @@ -59,5 +57,11 @@ pipeline: globals: input_size: answer_words_vocabulary_size prediction_size: vocabulary_size_c4 - + + # Viewers. 
+ viewer: + type: StreamViewer + priority: 100.4 + input_streams: answers, tokenized_answer_words, predicted_answers + #: pipeline From c314784c036bd6a4ec55f3b7fc1d69b7abe55ea1 Mon Sep 17 00:00:00 2001 From: tkornut Date: Fri, 26 Apr 2019 10:25:47 -0700 Subject: [PATCH 2/3] Added random shuffling and random removing of stop words to vqa_med problem --- .../image_text_to_class/vqa_med_2019.py | 69 ++++++++++++++++++- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/ptp/components/problems/image_text_to_class/vqa_med_2019.py b/ptp/components/problems/image_text_to_class/vqa_med_2019.py index a06624c..704b6c1 100644 --- a/ptp/components/problems/image_text_to_class/vqa_med_2019.py +++ b/ptp/components/problems/image_text_to_class/vqa_med_2019.py @@ -20,9 +20,10 @@ import os import string import tqdm + import pandas as pd from PIL import Image - +import numpy as np import nltk import torch @@ -307,6 +308,62 @@ def preprocess_text(self, text, lowercase = False, remove_punctuation = False, t # Return cleaned text. return cleansed_words + def random_remove_stop_words(self, words): + """ + Function removes random stop words, each with 0.5 probability. + + :param words: tokenized text + :return: resulting tokenized text. + """ + + # Find stop words. + stops = set(nltk.corpus.stopwords.words("english")) + stop_words = [False]*len(words) + for i, word in enumerate(words): + if word in stops: + stop_words[i] = True + #print(stop_words) + if sum(stop_words) > 0: + remove_probs = np.random.binomial(1, 0.5, len(words)) + #print(remove_probs) + result = [] + for word,is_stop,rem_prob in zip(words,stop_words,remove_probs): + if is_stop and rem_prob: + # Remove word. + continue + # Else: add word. + result.append(word) + + return result + + + def random_shuffle_words(self, words): + """ + Function randomly shuffles, with probability of 0.5, two consecutive words in text. + + :param words: tokenized text + :return: resulting tokenized text. 
+ """ + # Do not shuffle if there are less than 2 words. + if len(words) < 2: + return words + # Shuffle with probability of 0.5. + if np.random.binomial(1, 0.5, 1): + return words + + # Find words to shuffle - random without replacement. + shuffled_i = np.random.choice(len(words)-1, ) + indices = [i for i in range(len(words))] + indices[shuffled_i] = shuffled_i+1 + indices[shuffled_i+1] = shuffled_i + #print(indices) + + # Create resulting table. + result = [words[indices[i]] for i in range(len(words))] + + return result + + def load_dataset(self, source_files, source_categories): """ Loads the dataset from one or more files. @@ -368,7 +425,6 @@ def load_dataset(self, source_files, source_categories): return dataset - def __getitem__(self, index): """ Getter method to access the dataset and return a single sample. @@ -424,7 +480,14 @@ def __getitem__(self, index): # Apply question transformations. preprocessed_question = item[self.key_questions] - # TODO: apply additional random transformations e.g. "shuffle_words" + if 'tokenize' in self.question_preprocessing: + # Apply them only if text is tokenized. + if 'random_remove_stop_words' in self.question_preprocessing: + preprocessed_question = self.random_remove_stop_words(preprocessed_question) + + if 'random_shuffle_words' in self.question_preprocessing: + preprocessed_question = self.random_shuffle_words(preprocessed_question) + # Return question. data_dict[self.key_questions] = preprocessed_question # Return answer. 
From c9327d2bedc34d9cf916cffaa343d2e33ca19b75 Mon Sep 17 00:00:00 2001
From: tkornut
Date: Fri, 26 Apr 2019 10:45:12 -0700
Subject: [PATCH 3/3] c2 word answer one hot bow

---
 .../c2_word_answer_onehot_bow.yml             | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml

diff --git a/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml b/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml
new file mode 100644
index 0000000..3733970
--- /dev/null
+++ b/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml
@@ -0,0 +1,67 @@
+# Load config defining problems for training, validation and testing.
+default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml
+
+# Training parameters:
+training:
+  problem:
+    batch_size: 128
+
+# Validation parameters:
+validation:
+  problem:
+    batch_size: 128
+
+pipeline:
+  name: c2_word_answer_onehot_bow
+
+  # Answer encoding.
+  answer_tokenizer:
+    type: SentenceTokenizer
+    priority: 1.1
+    preprocessing: lowercase,remove_punctuation
+    remove_characters: [“,”,’]
+    streams:
+      inputs: answers
+      outputs: tokenized_answer_words
+
+  answer_onehot_encoder:
+    type: SentenceOneHotEncoder
+    priority: 1.2
+    data_folder: ~/data/vqa-med
+    word_mappings_file: answer_words.c2.preprocessed.word.mappings.csv
+    export_word_mappings_to_globals: True
+    streams:
+      inputs: tokenized_answer_words
+      outputs: encoded_answer_words
+    globals:
+      vocabulary_size: answer_words_vocabulary_size
+      word_mappings: answer_words_word_mappings
+
+  answer_bow_encoder:
+    type: BOWEncoder
+    priority: 1.3
+    streams:
+      inputs: encoded_answer_words
+      outputs: bow_answer_words
+    globals:
+      bow_size: answer_words_vocabulary_size
+
+  # Model.
+ classifier: + type: FeedForwardNetwork + hidden_sizes: [500, 500] + dropout_rate: 0.5 + priority: 3 + streams: + inputs: bow_answer_words + globals: + input_size: answer_words_vocabulary_size + prediction_size: vocabulary_size_c2 + + # Viewers. + viewer: + type: StreamViewer + priority: 100.4 + input_streams: answers, tokenized_answer_words, predicted_answers + +#: pipeline