From 566bef8110fa16b718037bdac1775487f38b5a01 Mon Sep 17 00:00:00 2001 From: tkornut Date: Fri, 26 Apr 2019 09:32:22 -0700 Subject: [PATCH 1/3] c4 answer words bow --- .../c4_classification/c4_word_answer_onehot_bow.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml index 842a987..1db17e6 100644 --- a/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml +++ b/configs/vqa_med_2019/c4_classification/c4_word_answer_onehot_bow.yml @@ -5,13 +5,11 @@ default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml training: problem: batch_size: 128 - remove_punctuation: all # Validation parameters: validation: problem: batch_size: 128 - remove_punctuation: all pipeline: name: c4_word_answer_onehot_bow @@ -51,7 +49,7 @@ pipeline: # Model. classifier: type: FeedForwardNetwork - hidden_sizes: [500] + hidden_sizes: [500, 500] dropout_rate: 0.5 priority: 3 streams: @@ -59,5 +57,11 @@ pipeline: globals: input_size: answer_words_vocabulary_size prediction_size: vocabulary_size_c4 - + + # Viewers. 
+ viewer: + type: StreamViewer + priority: 100.4 + input_streams: answers, tokenized_answer_words, predicted_answers + #: pipeline From c314784c036bd6a4ec55f3b7fc1d69b7abe55ea1 Mon Sep 17 00:00:00 2001 From: tkornut Date: Fri, 26 Apr 2019 10:25:47 -0700 Subject: [PATCH 2/3] Added random shuffling and random removing of stop words to vqa_med problem --- .../image_text_to_class/vqa_med_2019.py | 69 ++++++++++++++++++- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/ptp/components/problems/image_text_to_class/vqa_med_2019.py b/ptp/components/problems/image_text_to_class/vqa_med_2019.py index a06624c..704b6c1 100644 --- a/ptp/components/problems/image_text_to_class/vqa_med_2019.py +++ b/ptp/components/problems/image_text_to_class/vqa_med_2019.py @@ -20,9 +20,10 @@ import os import string import tqdm + import pandas as pd from PIL import Image - +import numpy as np import nltk import torch @@ -307,6 +308,62 @@ def preprocess_text(self, text, lowercase = False, remove_punctuation = False, t # Return cleaned text. return cleansed_words + def random_remove_stop_words(self, words): + """ + Function removes random stop words, each with 0.5 probability. + + :param words: tokenized text + :return: resulting tokenized text. + """ + + # Find stop words. + stops = set(nltk.corpus.stopwords.words("english")) + stop_words = [False]*len(words) + for i, word in enumerate(words): + if word in stops: + stop_words[i] = True + #print(stop_words) + if sum(stop_words) > 0: + remove_probs = np.random.binomial(1, 0.5, len(words)) + #print(remove_probs) + result = [] + for word,is_stop,rem_prob in zip(words,stop_words,remove_probs): + if is_stop and rem_prob: + # Remove word. + continue + # Else: add word. + result.append(word) + + return result + + + def random_shuffle_words(self, words): + """ + Function randomly shuffles, with probability of 0.5, two consecutive words in text. + + :param words: tokenized text + :return: resulting tokenized text. 
+ """ + # Do not shuffle if there are less than 2 words. + if len(words) < 2: + return words + # Shuffle with probability of 0.5. + if np.random.binomial(1, 0.5, 1): + return words + + # Find words to shuffle - random without replacement. + shuffled_i = np.random.choice(len(words)-1, ) + indices = [i for i in range(len(words))] + indices[shuffled_i] = shuffled_i+1 + indices[shuffled_i+1] = shuffled_i + #print(indices) + + # Create resulting table. + result = [words[indices[i]] for i in range(len(words))] + + return result + + def load_dataset(self, source_files, source_categories): """ Loads the dataset from one or more files. @@ -368,7 +425,6 @@ def load_dataset(self, source_files, source_categories): return dataset - def __getitem__(self, index): """ Getter method to access the dataset and return a single sample. @@ -424,7 +480,14 @@ def __getitem__(self, index): # Apply question transformations. preprocessed_question = item[self.key_questions] - # TODO: apply additional random transformations e.g. "shuffle_words" + if 'tokenize' in self.question_preprocessing: + # Apply them only if text is tokenized. + if 'random_remove_stop_words' in self.question_preprocessing: + preprocessed_question = self.random_remove_stop_words(preprocessed_question) + + if 'random_shuffle_words' in self.question_preprocessing: + preprocessed_question = self.random_shuffle_words(preprocessed_question) + # Return question. data_dict[self.key_questions] = preprocessed_question # Return answer. 
From c9327d2bedc34d9cf916cffaa343d2e33ca19b75 Mon Sep 17 00:00:00 2001
From: tkornut
Date: Fri, 26 Apr 2019 10:45:12 -0700
Subject: [PATCH 3/3] c2 word answer one hot bow

---
 .../c2_word_answer_onehot_bow.yml             | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml

diff --git a/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml b/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml
new file mode 100644
index 0000000..3733970
--- /dev/null
+++ b/configs/vqa_med_2019/c2_classification/c2_word_answer_onehot_bow.yml
@@ -0,0 +1,67 @@
+# Load config defining problems for training, validation and testing.
+default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml
+
+# Training parameters:
+training:
+  problem:
+    batch_size: 128
+
+# Validation parameters:
+validation:
+  problem:
+    batch_size: 128
+
+pipeline:
+  name: c2_word_answer_onehot_bow
+
+  # Answer encoding.
+  answer_tokenizer:
+    type: SentenceTokenizer
+    priority: 1.1
+    preprocessing: lowercase,remove_punctuation
+    remove_characters: [“,”,’]
+    streams:
+      inputs: answers
+      outputs: tokenized_answer_words
+
+  answer_onehot_encoder:
+    type: SentenceOneHotEncoder
+    priority: 1.2
+    data_folder: ~/data/vqa-med
+    word_mappings_file: answer_words.c2.preprocessed.word.mappings.csv
+    export_word_mappings_to_globals: True
+    streams:
+      inputs: tokenized_answer_words
+      outputs: encoded_answer_words
+    globals:
+      vocabulary_size: answer_words_vocabulary_size
+      word_mappings: answer_words_word_mappings
+
+  answer_bow_encoder:
+    type: BOWEncoder
+    priority: 1.3
+    streams:
+      inputs: encoded_answer_words
+      outputs: bow_answer_words
+    globals:
+      bow_size: answer_words_vocabulary_size
+
+  # Model.
+ classifier: + type: FeedForwardNetwork + hidden_sizes: [500, 500] + dropout_rate: 0.5 + priority: 3 + streams: + inputs: bow_answer_words + globals: + input_size: answer_words_vocabulary_size + prediction_size: vocabulary_size_c2 + + # Viewers. + viewer: + type: StreamViewer + priority: 100.4 + input_streams: answers, tokenized_answer_words, predicted_answers + +#: pipeline