@@ -0,0 +1,67 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml

# Training parameters:
training:
  problem:
    batch_size: 128

# Validation parameters:
validation:
  problem:
    batch_size: 128

pipeline:
  name: c2_word_answer_onehot_bow

  # Answer encoding.
  answer_tokenizer:
    type: SentenceTokenizer
    priority: 1.1
    preprocessing: lowercase,remove_punctuation
    remove_characters: [“,”,’]
    streams:
      inputs: answers
      outputs: tokenized_answer_words

  answer_onehot_encoder:
    type: SentenceOneHotEncoder
    priority: 1.2
    data_folder: ~/data/vqa-med
    word_mappings_file: answer_words.c2.preprocessed.word.mappings.csv
    export_word_mappings_to_globals: True
    streams:
      inputs: tokenized_answer_words
      outputs: encoded_answer_words
    globals:
      vocabulary_size: answer_words_vocabulary_size
      word_mappings: answer_words_word_mappings

  answer_bow_encoder:
    type: BOWEncoder
    priority: 1.3
    streams:
      inputs: encoded_answer_words
      outputs: bow_answer_words
    globals:
      bow_size: answer_words_vocabulary_size

  # Model.
  classifier:
    type: FeedForwardNetwork
    hidden_sizes: [500, 500]
    dropout_rate: 0.5
    priority: 3
    streams:
      inputs: bow_answer_words
    globals:
      input_size: answer_words_vocabulary_size
      prediction_size: vocabulary_size_c2

  # Viewers.
  viewer:
    type: StreamViewer
    priority: 100.4
    input_streams: answers, tokenized_answer_words, predicted_answers

#: pipeline
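For intuition, here is a minimal Python sketch (editorial, not part of this PR) of what the answer_tokenizer, answer_onehot_encoder and answer_bow_encoder chain above computes: the answer is lowercased and tokenized, each word is looked up in the word mappings, and the resulting indices are aggregated into a single bag-of-words vector of size answer_words_vocabulary_size. The toy vocabulary below stands in for answer_words.c2.preprocessed.word.mappings.csv; all names are illustrative.

import torch

def answer_to_bow(answer, word_to_index):
    # Tokenization: lowercase and split on whitespace (punctuation already removed).
    words = answer.lower().split()
    # One-hot encoding: map every known word to its vocabulary index.
    indices = [word_to_index[w] for w in words if w in word_to_index]
    # Bag-of-words: sum the one-hot vectors into a single fixed-size vector.
    bow = torch.zeros(len(word_to_index))
    for i in indices:
        bow[i] += 1.0
    return bow

# Toy vocabulary standing in for the word mappings file.
vocab = {"chest": 0, "x-ray": 1, "mri": 2, "axial": 3}
print(answer_to_bow("Axial MRI", vocab))  # tensor([0., 0., 1., 1.])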
@@ -5,13 +5,11 @@ default_configs: vqa_med_2019/c4_classification/default_c4_classification.yml
 training:
   problem:
     batch_size: 128
-    remove_punctuation: all
 
 # Validation parameters:
 validation:
   problem:
     batch_size: 128
-    remove_punctuation: all
 
 pipeline:
   name: c4_word_answer_onehot_bow
@@ -51,13 +49,19 @@ pipeline:
   # Model.
   classifier:
     type: FeedForwardNetwork
-    hidden_sizes: [500]
+    hidden_sizes: [500, 500]
+    dropout_rate: 0.5
     priority: 3
     streams:
       inputs: bow_answer_words
     globals:
       input_size: answer_words_vocabulary_size
       prediction_size: vocabulary_size_c4
 
 
+  # Viewers.
+  viewer:
+    type: StreamViewer
+    priority: 100.4
+    input_streams: answers, tokenized_answer_words, predicted_answers
 
 #: pipeline
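The classifier change above deepens the FeedForwardNetwork from one to two 500-unit hidden layers and adds dropout of 0.5. As a rough sketch only, assuming hidden_sizes enumerates fully connected layers with non-linearities and dropout_rate is applied after each hidden layer (an assumption, not PTP's actual implementation), the configured model corresponds to something like:

import torch.nn as nn

def make_ffn(input_size, hidden_sizes, prediction_size, dropout_rate):
    # Stack Linear -> ReLU -> Dropout for every hidden size, then a final projection.
    layers, in_features = [], input_size
    for h in hidden_sizes:
        layers += [nn.Linear(in_features, h), nn.ReLU(), nn.Dropout(dropout_rate)]
        in_features = h
    layers.append(nn.Linear(in_features, prediction_size))
    return nn.Sequential(*layers)

# Sizes below are placeholders; the real ones come from the globals
# answer_words_vocabulary_size and vocabulary_size_c4.
classifier = make_ffn(input_size=1000, hidden_sizes=[500, 500],
                      prediction_size=330, dropout_rate=0.5)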
ptp/components/problems/image_text_to_class/vqa_med_2019.py (69 changes: 66 additions & 3 deletions)
@@ -20,9 +20,10 @@
 import os
 import string
 import tqdm
 
 import pandas as pd
 from PIL import Image
-
+import numpy as np
+import nltk
 
 import torch
@@ -307,6 +308,62 @@ def preprocess_text(self, text, lowercase = False, remove_punctuation = False, t
         # Return cleaned text.
         return cleansed_words
 
+    def random_remove_stop_words(self, words):
+        """
+        Removes stop words from the tokenized text, each with a probability of 0.5.
+
+        :param words: tokenized text
+        :return: resulting tokenized text.
+        """
+        # Mark the stop words.
+        stops = set(nltk.corpus.stopwords.words("english"))
+        stop_words = [False]*len(words)
+        for i, word in enumerate(words):
+            if word in stops:
+                stop_words[i] = True
+        # No stop words - return the text unchanged.
+        if sum(stop_words) == 0:
+            return words
+        # Draw, for every word, whether it will be removed (p = 0.5).
+        remove_probs = np.random.binomial(1, 0.5, len(words))
+        result = []
+        for word, is_stop, remove in zip(words, stop_words, remove_probs):
+            if is_stop and remove:
+                # Remove word.
+                continue
+            # Else: keep word.
+            result.append(word)
+
+        return result
+
+    def random_shuffle_words(self, words):
+        """
+        With a probability of 0.5, swaps two randomly picked consecutive words in the text.
+
+        :param words: tokenized text
+        :return: resulting tokenized text.
+        """
+        # Do not shuffle if there are less than 2 words.
+        if len(words) < 2:
+            return words
+        # With a probability of 0.5: leave the text unchanged.
+        if np.random.binomial(1, 0.5):
+            return words
+
+        # Pick the first of the two consecutive words to be swapped.
+        shuffled_i = np.random.choice(len(words)-1)
+        indices = list(range(len(words)))
+        indices[shuffled_i] = shuffled_i+1
+        indices[shuffled_i+1] = shuffled_i
+
+        # Create the resulting list of words.
+        result = [words[indices[i]] for i in range(len(words))]
+
+        return result
+
     def load_dataset(self, source_files, source_categories):
         """
         Loads the dataset from one or more files.
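For a quick sanity check of the two new augmentation methods, here is a hedged usage sketch (editorial, not part of this PR). Outputs vary between runs because both methods draw from np.random; problem stands for an already-constructed instance of the problem class defined in this file, and nltk's stop-word corpus has to be downloaded once.

import nltk
nltk.download("stopwords")  # one-time download of the stop-word list

words = ["what", "is", "the", "organ", "in", "the", "image"]
# Each stop word ("is", "the", "in") is dropped independently with probability 0.5.
print(problem.random_remove_stop_words(words))
# possible output: ['what', 'the', 'organ', 'image']
# With probability 0.5 the question is left intact, otherwise one random
# pair of consecutive words is swapped.
print(problem.random_shuffle_words(words))
# possible output: ['what', 'is', 'organ', 'the', 'in', 'the', 'image']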
@@ -368,7 +425,6 @@ def load_dataset(self, source_files, source_categories):
         return dataset
 
 
-
     def __getitem__(self, index):
         """
         Getter method to access the dataset and return a single sample.
@@ -424,7 +480,14 @@ def __getitem__(self, index):

         # Apply question transformations.
         preprocessed_question = item[self.key_questions]
-        # TODO: apply additional random transformations e.g. "shuffle_words"
+        if 'tokenize' in self.question_preprocessing:
+            # Apply them only when the text is tokenized.
+            if 'random_remove_stop_words' in self.question_preprocessing:
+                preprocessed_question = self.random_remove_stop_words(preprocessed_question)
+
+            if 'random_shuffle_words' in self.question_preprocessing:
+                preprocessed_question = self.random_shuffle_words(preprocessed_question)
+
         # Return question.
         data_dict[self.key_questions] = preprocessed_question
 
         # Return answer.
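Since the augmentations run inside __getitem__, they are re-drawn on every access: the same sample index yields a slightly different question each epoch, which is the point of on-the-fly augmentation. A minimal illustration, again assuming problem is a constructed problem instance with tokenize, random_remove_stop_words and random_shuffle_words listed in its question preprocessing:

# Repeated access to the same index produces different augmented questions.
for epoch in range(3):
    sample = problem[0]                    # calls __getitem__(0)
    print(sample[problem.key_questions])   # augmented question tokens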