Merged
34 changes: 34 additions & 0 deletions configs/default/components/publishers/stream_file_exporter.yml
@@ -0,0 +1,34 @@
# This file defines the default values for the Stream File Exporter.

####################################################################
# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
####################################################################

# List of names of streams that will be exported to the file (LOADED)
# Can be a single name or a comma-separated list of names
input_streams: ''

# Separator that will be placed between values (LOADED)
separator: ','

# Name of the file containing output values (LOADED)
filename: 'outputs.txt'

streams:
####################################################################
# 2. Keymappings associated with INPUT and OUTPUT streams.
####################################################################

globals:
####################################################################
# 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
####################################################################

####################################################################
# 4. Keymappings associated with GLOBAL variables that will be SET.
####################################################################

####################################################################
# 5. Keymappings associated with statistics that will be ADDED.
####################################################################
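
For illustration only (not part of this diff): a minimal, hypothetical pipeline section overriding the three LOADED parameters above; the section name, priority, and stream names are placeholders.

pipeline:
  answer_exporter:
    priority: 100.5
    type: StreamFileExporter
    # A single stream name or a comma-separated list of names.
    input_streams: answers,predicted_answers
    separator: '|'
    filename: 'predicted_answers.txt'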

62 changes: 62 additions & 0 deletions configs/vqa_med_2019/default_extend_answers.yml
@@ -0,0 +1,62 @@
# This config is not a standalone config!
# It adds new sections (sets) without samplers, plus components for saving answers that can later be used to produce the final answers.

training_answers:
  problem:
    type: &p_type VQAMED2019
    data_folder: &data_folder ~/data/vqa-med
    split: training
    categories: all
    resize_image: &resize_image [224, 224]
    batch_size: 64
  dataloader:
    # No sampler: process the samples in their original order.
    shuffle: false
    # Use 1 worker, so batches follow the sample order.
    num_workers: 1

validation_answers:
  problem:
    type: *p_type
    data_folder: *data_folder
    split: validation
    resize_image: *resize_image
    batch_size: 64
  dataloader:
    # No sampler: process the samples in their original order.
    shuffle: false
    # Use 1 worker, so batches follow the sample order.
    num_workers: 1


# Testing parameters:
test_answers:
  problem:
    type: *p_type
    data_folder: *data_folder
    split: test
    resize_image: *resize_image
    batch_size: 64
  dataloader:
    # No sampler: process the samples in their original order.
    shuffle: false
    # Use 1 worker, so batches follow the sample order.
    num_workers: 1

# Add component for exporting answers to files.
pipeline:
  disable: viewer
  # Viewers.
  viewer_extended:
    priority: 100.4
    type: StreamViewer
    sample_number: 0
    input_streams: indices,image_ids,questions,category_names,predicted_categories,answers,tokenized_answers,predicted_answers

  exporter:
    priority: 100.5
    type: StreamFileExporter
    separator: '|'
    input_streams: indices,image_ids,questions,category_names,predicted_categories,answers,tokenized_answers,predicted_answers

#: pipeline
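
For illustration only (not part of this diff): a hedged sketch of how an experiment config could pull in this extension via the default_configs mechanism used elsewhere in this PR. The file name below and the assumption that default_configs accepts a comma-separated list are not confirmed by this diff.

# Hypothetical experiment config, e.g. configs/vqa_med_2019/my_experiment.yml
default_configs: vqa_med_2019/default_vqa_med_2019.yml,vqa_med_2019/default_extend_answers.yml

# The *_answers sections and the exporter pipeline defined above would then be merged in
# and could be overridden here (e.g. to change the output filename or separator).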
11 changes: 0 additions & 11 deletions configs/vqa_med_2019/default_vqa_med_2019.yml
@@ -42,14 +42,3 @@ validation:
  # Use four workers for loading images.
  dataloader:
    num_workers: 4


# Testing parameters:
testing:
  problem:
    type: *p_type
    data_folder: *data_folder
    split: test
    resize_image: *resize_image
    batch_size: 32

@@ -2,15 +2,19 @@
default_configs: vqa_med_2019/default_vqa_med_2019.yml

training:
  problem:
    categories: all
    export_sample_weights: ~/data/vqa-med/answers.all.weights.csv
  sampler:
    weights: ~/data/vqa-med/answers.all.weights.csv

  # Training termination settings.
  terminal_conditions:
    loss_stop: 1.0e-3

validation:
  problem:
    categories: all


pipeline:

# Predictions decoder.
162 changes: 118 additions & 44 deletions ptp/components/problems/image_text_to_class/vqa_med_2019.py
@@ -33,7 +33,7 @@
from ptp.data_types.data_definition import DataDefinition

from ptp.components.utils.io import save_nparray_to_csv_file
from ptp.configuration.config_parsing import get_value_list_from_dictionary
from ptp.configuration.config_parsing import get_value_list_from_dictionary, get_value_from_dictionary


class VQAMED2019(Problem):
@@ -109,11 +109,49 @@ def __init__(self, name, config):
self.globals["category_word_mappings"] = {'C1': 0, 'C2': 1, 'C3': 2, 'C4': 3, 'BINARY': 4, '<UNK>': 5}
self.category_idx_to_word = {0: 'C1', 1: 'C2', 2: 'C3', 3: 'C4', 4: 'BINARY', 5: '<UNK>'}

# Get image preprocessing.
self.image_preprocessing = get_value_list_from_dictionary(
"image_preprocessing", self.config,
'none | random_affine | random_horizontal_flip | normalize | all'.split(" | ")
)
if 'none' in self.image_preprocessing:
self.image_preprocessing = []
if 'all' in self.image_preprocessing:
self.image_preprocessing = 'random_affine | random_horizontal_flip | normalize'.split(" | ")
self.logger.info("Applied image preprocessing: {}".format(self.image_preprocessing))


# Get question preprocessing.
self.question_preprocessing = get_value_list_from_dictionary(
"question_preprocessing", self.config,
'none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all'.split(" | ")
)
if 'none' in self.question_preprocessing:
self.question_preprocessing = []
if 'all' in self.question_preprocessing:
self.question_preprocessing = 'lowercase | remove_punctuation | tokenize | remove_stop_words | shuffle_words'.split(" | ")
self.logger.info("Applied question preprocessing: {}".format(self.question_preprocessing))

# Get answer preprocessing.
self.answer_preprocessing = get_value_list_from_dictionary(
"answer_preprocessing", self.config,
'none | lowercase | remove_punctuation | tokenize | all'.split(" | ")
)
if 'none' in self.answer_preprocessing:
self.answer_preprocessing = []
if 'all' in self.answer_preprocessing:
self.answer_preprocessing = 'lowercase | remove_punctuation | tokenize'.split(" | ")
self.logger.info("Applied answer preprocessing: {}".format(self.answer_preprocessing))


# Get the absolute path.
self.data_folder = os.path.expanduser(self.config['data_folder'])

# Get split.
split = get_value_from_dictionary('split', self.config, "training,validation,training_validation,test".split(","))

# Set split-dependent data.
if self.config['split'] == 'training':
if split == 'training':
# Training split folder.
split_folder = os.path.join(self.data_folder, "ImageClef-2019-VQA-Med-Training")
# Set source files.
@@ -131,8 +169,10 @@ def __init__(self, name, config):

# Filter lists taking into account configuration.
source_files, source_image_folders, source_categories = self.filter_sources(source_files, source_image_folders, source_categories)
# Load dataset.
self.dataset = self.load_dataset(source_files, source_image_folders, source_categories)

elif self.config['split'] == 'validation':
elif split == 'validation':
# Validation split folder.
split_folder = os.path.join(self.data_folder, "ImageClef-2019-VQA-Med-Validation")

@@ -152,8 +192,10 @@ def __init__(self, name, config):

# Filter lists taking into account configuration.
source_files, source_image_folders, source_categories = self.filter_sources(source_files, source_image_folders, source_categories)
# Load dataset.
self.dataset = self.load_dataset(source_files, source_image_folders, source_categories)

elif self.config['split'] == 'training_validation':
elif split == 'training_validation':
# This split takes both training and validation and assumes utilization of kFoldWeightedRandomSampler.

# 1. Training split folder.
@@ -198,47 +240,17 @@ def __init__(self, name, config):
source_files = [*training_source_files, *valid_source_files]
source_image_folders = [*training_source_image_folders, *valid_source_image_folders]
source_categories = [*training_source_categories, *valid_source_categories]
# else: # Test set. # TODO

# Get image preprocessing.
self.image_preprocessing = get_value_list_from_dictionary(
"image_preprocessing", self.config,
'none | random_affine | random_horizontal_flip | normalize | all'.split(" | ")
)
if 'none' in self.image_preprocessing:
self.image_preprocessing = []
if 'all' in self.image_preprocessing:
self.image_preprocessing = 'random_affine | random_horizontal_flip | normalize'.split(" | ")
self.logger.info("Applied image preprocessing: {}".format(self.image_preprocessing))


# Get question preprocessing.
self.question_preprocessing = get_value_list_from_dictionary(
"question_preprocessing", self.config,
'none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all'.split(" | ")
)
if 'none' in self.question_preprocessing:
self.question_preprocessing = []
if 'all' in self.question_preprocessing:
self.question_preprocessing = 'lowercase | remove_punctuation | tokenize | remove_stop_words | shuffle_words'.split(" | ")
self.logger.info("Applied question preprocessing: {}".format(self.question_preprocessing))
# Load dataset.
self.dataset = self.load_dataset(source_files, source_image_folders, source_categories)

# Get answer preprocessing.
self.answer_preprocessing = get_value_list_from_dictionary(
"answer_preprocessing", self.config,
'none | lowercase | remove_punctuation | tokenize | all'.split(" | ")
)
if 'none' in self.answer_preprocessing:
self.answer_preprocessing = []
if 'all' in self.answer_preprocessing:
self.answer_preprocessing = 'lowercase | remove_punctuation | tokenize '.split(" | ")
self.logger.info("Applied answer preprocessing: {}".format(self.answer_preprocessing))


# Load dataset.
self.logger.info("Loading dataset from files:\n {}".format(source_files))
self.dataset = self.load_dataset(source_files, source_image_folders, source_categories)
self.logger.info("Loaded dataset consisting of {} samples".format(len(self.dataset)))
else:
# Test set.
split_folder = os.path.join(self.data_folder, "ImageClef-2019-VQA-Med-Test")
# Set source file.
source_file = os.path.join(split_folder,"VQAMed2019_Test_Questions.txt")
# Set image folder.
source_image_folder = os.path.join(split_folder, 'VQAMed2019_Test_Images')
self.dataset = self.load_testset(source_file, source_image_folder)

# Display exemplary sample.
self.logger.info("Exemplary sample:\n [ category: {}\t image_ids: {}\t question: {}\t answer: {} ]".format(
@@ -492,6 +504,7 @@ def load_dataset(self, source_files, source_image_folders, source_categories):

:param source_categories: List of categories associated with each of those files. (<UNK> unknown)
"""
self.logger.info("Loading dataset from files:\n {}".format(source_files))
# List that will hold the sample dictionaries.
dataset = []

@@ -542,6 +555,67 @@ def load_dataset(self, source_files, source_image_folders, source_categories):
t.update()
t.close()

self.logger.info("Loaded dataset consisting of {} samples".format(len(dataset)))
# Return the created list.
return dataset


def load_testset(self, data_file, image_folder):
"""
Loads the test set.

:param data_file: Source file.

:param image_folder: Folder containing image files.

"""
# List that will hold the sample dictionaries.
dataset = []
category_id = 5 # <UNK>
answer = '<UNK>'

# Set absolute path to file.
self.logger.info('Loading test set from {}...'.format(data_file))
# Load file content using '|' separator.
df = pd.read_csv(filepath_or_buffer=data_file, sep='|',header=None,
names=[self.key_image_ids,self.key_questions])

# Add tqdm bar.
t = tqdm.tqdm(total=len(df.index))
for _, row in df.iterrows():
# Retrieve the question (the test set provides no ground-truth answers).
question = row[self.key_questions]

# Process question - if required.
preprocessed_question = self.preprocess_text(
question,
'lowercase' in self.question_preprocessing,
'remove_punctuation' in self.question_preprocessing,
'tokenize' in self.question_preprocessing,
'remove_stop_words' in self.question_preprocessing
)

# Process answer - if required.
if 'tokenize' in self.answer_preprocessing:
preprocessed_answer = [answer]
else:
preprocessed_answer = answer

# Add record to dataset.
dataset.append({
# Image name and path leading to it.
self.key_image_ids: row[self.key_image_ids],
"image_folder": image_folder,
self.key_questions: preprocessed_question,
self.key_answers: preprocessed_answer,
# Add category.
self.key_category_ids: category_id
})

t.update()
t.close()

self.logger.info("Loaded dataset consisting of {} samples".format(len(dataset)))
# Return the created list.
return dataset
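
For reference, a hedged inference from the read_csv call above (not stated explicitly in the diff): VQAMed2019_Test_Questions.txt is read as one image-id/question pair per line, separated by '|'; a hypothetical line would look like: synpic54321|what imaging modality was used?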

2 changes: 2 additions & 0 deletions ptp/components/publishers/__init__.py
@@ -3,11 +3,13 @@
from .bleu_statistics import BLEUStatistics
from .global_variable_publisher import GlobalVariablePublisher
from .precision_recall_statistics import PrecisionRecallStatistics
from .stream_file_exporter import StreamFileExporter

__all__ = [
'AccuracyStatistics',
'BatchSizeStatistics',
'BLEUStatistics',
'GlobalVariablePublisher',
'PrecisionRecallStatistics',
'StreamFileExporter',
]