diff --git a/configs/vqa_med_2019/c1_classification/c1_classification_question_mimic_rnn.yml b/configs/vqa_med_2019/c1_classification/c1_classification_question_mimic_rnn.yml
new file mode 100644
index 0000000..aea4592
--- /dev/null
+++ b/configs/vqa_med_2019/c1_classification/c1_classification_question_mimic_rnn.yml
@@ -0,0 +1,42 @@
+# Load config defining problems for training, validation and testing.
+default_configs: vqa_med_2019/c1_classification/default_c1_classification.yml
+
+pipeline:
+  name: c1_classification_question_mimic_rnn
+
+  # Questions encoding.
+  question_tokenizer:
+    type: SentenceTokenizer
+    priority: 1.1
+    streams:
+      inputs: questions
+      outputs: tokenized_questions
+
+  # Model 1: Embeddings
+  question_embeddings:
+    type: SentenceEmbeddings
+    priority: 1.2
+    embeddings_size: 300
+    pretrained_embeddings_file: mimic.fastText.no_clean.300d.pickled
+    data_folder: ~/data/vqa-med
+    word_mappings_file: questions.all.word.mappings.csv
+    streams:
+      inputs: tokenized_questions
+      outputs: embedded_questions
+
+  # Model 2: RNN
+  lstm:
+    type: RecurrentNeuralNetwork
+    cell_type: LSTM
+    prediction_mode: Last
+    priority: 3
+    initial_state: Zero
+    #num_layers: 5
+    hidden_size: 50
+    streams:
+      inputs: embedded_questions
+    globals:
+      input_size: embeddings_size
+      prediction_size: vocabulary_size_c1
+
+#: pipeline
diff --git a/configs/vqa_med_2019/c1_classification/c1_classification_question_rnn.yml b/configs/vqa_med_2019/c1_classification/c1_classification_question_rnn.yml
index 9242d43..4c1d3a6 100644
--- a/configs/vqa_med_2019/c1_classification/c1_classification_question_rnn.yml
+++ b/configs/vqa_med_2019/c1_classification/c1_classification_question_rnn.yml
@@ -2,8 +2,8 @@
 default_configs: vqa_med_2019/c1_classification/default_c1_classification.yml
 
 pipeline:
-  name: vqa_med_c1_classification_question_rnn
-
+  name: c1_classification_question_rnn
+
   # Questions encoding.
   question_tokenizer:
     type: SentenceTokenizer
diff --git a/ptp/components/models/sentence_embeddings.py b/ptp/components/models/sentence_embeddings.py
index f263e60..ef4f75c 100644
--- a/ptp/components/models/sentence_embeddings.py
+++ b/ptp/components/models/sentence_embeddings.py
@@ -70,7 +70,7 @@ def __init__(self, name, config):
 
         # Load the embeddings first.
         if self.config["pretrained_embeddings_file"] != '':
-            emb_vectors = emb.load_pretrained_glove_vectors(self.logger, self.data_folder, self.config["pretrained_embeddings_file"], self.word_to_ix, self.embeddings_size)
+            emb_vectors = emb.load_pretrained_embeddings(self.logger, self.data_folder, self.config["pretrained_embeddings_file"], self.word_to_ix, self.embeddings_size)
 
             self.embeddings.weight = torch.nn.Parameter(emb_vectors)
 
diff --git a/ptp/components/utils/embeddings.py b/ptp/components/utils/embeddings.py
index bd7887a..6d22d71 100644
--- a/ptp/components/utils/embeddings.py
+++ b/ptp/components/utils/embeddings.py
@@ -20,11 +20,10 @@
 import os
 import numpy as np
 import torch
-
 import ptp.components.utils.io as io
 
 
-def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, embeddings_size):
+def load_pretrained_embeddings(logger, folder, embeddings_name, word_to_ix, embeddings_size):
     """
     Creates embedding vector for words from the provided (word:index) mappings (dictionary).
 
@@ -40,6 +39,7 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, e
         - glove.42B.300d.txt
         - glove.840B.300d.txt
         - glove.twitter.27B.txt
+        - mimic.fastText.no_clean.300d.pickled
 
     :param logger: Logger object.
 
@@ -67,41 +67,68 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, e
     pretrained_embeddings_urls["glove.42B.300d.txt"] = ("http://nlp.stanford.edu/data/glove.42B.300d.zip", "glove.42B.300d.zip")
     pretrained_embeddings_urls["glove.840B.300d.txt"] = ("http://nlp.stanford.edu/data/glove.840B.300d.zip", "glove.840B.300d.zip")
     pretrained_embeddings_urls["glove.twitter.27B.txt"] = ("http://nlp.stanford.edu/data/glove.twitter.27B.zip", "glove.twitter.27B.zip")
+    pretrained_embeddings_urls["mimic.fastText.no_clean.300d.pickled"] = ("https://mednli.blob.core.windows.net/shared/word_embeddings/mimic.fastText.no_clean.300d.pickled","mimic.fastText.no_clean.300d.pickled")
 
     if (embeddings_name not in pretrained_embeddings_urls.keys()):
         logger.error("Cannot load the indicated pretrained embeddings (current '{}' must be one of {})".format(embeddings_name, pretrained_embeddings_urls.keys()))
         exit(1)
 
-    # Check presence of the file.
-    if not io.check_file_existence(folder, embeddings_name):
-        # Download and extract wikitext zip.
-        io.download_extract_zip_file(logger, folder, pretrained_embeddings_urls[embeddings_name][0], pretrained_embeddings_urls[embeddings_name][1])
-    else:
-        logger.info("File '{}' containing pretrained embeddings found in '{}' folder".format(embeddings_name, folder))
+    logger.info("Initializing embeddings in folder {}".format(folder))
 
     num_loaded_embs = 0
     # Set random embeddings for words "out of vocabulary".
     # embeddings = np.zeros((len(word_to_ix), embeddings_size))
     embeddings = np.random.normal(scale=0.6, size=(len(word_to_ix), embeddings_size))
+
     # Open the embeddings file.
-    with open(os.path.join(folder, embeddings_name)) as f:
-        # Parse file
-        for line in f.readlines():
-            values = line.split()
-            # Get word.
-            word = values[0]
-            # Get index.
-            index = word_to_ix.get(word)
-            if index:
-                vector = np.array(values[1:], dtype='float32')
+    if embeddings_name == "mimic.fastText.no_clean.300d.pickled":
+        # Check if pickle exists.
+        file_name = pretrained_embeddings_urls[embeddings_name][1]
+        if not io.check_file_existence(folder, file_name):
+            # Try to download the pickle.
+            url = pretrained_embeddings_urls[embeddings_name][0]
+            logger.info("Downloading file '{}' from {}".format(file_name, url))
+            io.download(folder, file_name, url)
+        else:
+            logger.info("File '{}' found in {}".format(embeddings_name, folder))
+
+        # Load word embeddings map.
+        word_embedding_map = io.load_pickle(logger, os.path.join(folder, file_name))
+        # Iterate over map and cherry pick the vectors that fit our vocabulary.
+        for w, index in word_to_ix.items():
+            if w in word_embedding_map:
+                vector = word_embedding_map[w]
                 assert (len(vector) == embeddings_size), "Embeddings size must be equal to the size of pretrained embeddings!"
-                # Ok, set vector.
                 embeddings[index] = vector
-                # Increment counter.
                 num_loaded_embs += 1
+
+    else:
+
+        # Check presence of the file.
+        if not io.check_file_existence(folder, embeddings_name):
+            # Download and extract wikitext zip.
+            io.download_extract_zip_file(logger, folder, pretrained_embeddings_urls[embeddings_name][0], pretrained_embeddings_urls[embeddings_name][1])
+        else:
+            logger.info("File '{}' containing pretrained embeddings found in '{}' folder".format(embeddings_name, folder))
+
+
+        with open(os.path.join(folder, embeddings_name)) as f:
+            # Parse file and cherry pick the vectors that fit our vocabulary.
+            for line in f.readlines():
+                values = line.split()
+                # Get word.
+                word = values[0]
+                # Get index.
+                index = word_to_ix.get(word)
+                if index:
+                    vector = np.array(values[1:], dtype='float32')
+                    assert (len(vector) == embeddings_size), "Embeddings size must be equal to the size of pretrained embeddings!"
+                    # Ok, set vector.
+                    embeddings[index] = vector
+                    # Increment counter.
+                    num_loaded_embs += 1
 
     logger.info("Loaded {} pretrained embeddings for vocabulary of size {} from {}".format(num_loaded_embs, len(word_to_ix), embeddings_name))
 
     # Return matrix with embeddings.
     return torch.from_numpy(embeddings).float()
-
diff --git a/ptp/components/utils/io.py b/ptp/components/utils/io.py
index 30074f0..bbe7d58 100644
--- a/ptp/components/utils/io.py
+++ b/ptp/components/utils/io.py
@@ -22,6 +22,26 @@
 import time
 import requests
 from pathlib import Path
+import pickle
+
+
+def load_pickle(logger, filename):
+    """
+    Loads a pickle from file.
+
+    :param logger: Logger object.
+    :param filename: Absolute path along with the name of the file to be loaded.
+    """
+    try:
+        with open(str(filename), 'rb') as f:
+            obj = pickle.load(f)
+        logger.info('Loaded: %s', filename)
+
+    except EOFError:
+        logger.warning('Cannot load: %s', filename)
+        obj = None
+
+    return obj
 
 
 def save_nparray_to_csv_file(folder, filename, nparray, sep=','):
@@ -189,13 +209,12 @@ def download(folder, filename, url):
         r = requests.get(url)
         content_length = int(r.headers.get('content-length', None))
         count = 0
-
+        # Initialize download status.
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                count += 1
                reporthook(count, 1024, content_length)
-    #self.logger.info('Downloading {}'.format(url))
 
 
 def reporthook(count, block_size, total_size):
@@ -230,13 +249,13 @@ def download_extract_zip_file(logger, folder, url, zipfile_name):
     logger.info("Initializing download in folder {}".format(folder))
 
     if not check_file_existence(folder, zipfile_name):
-        logger.info("Downloading file {} from {}".format(zipfile_name, url))
+        logger.info("Downloading file '{}' from {}".format(zipfile_name, url))
         download(folder, zipfile_name, url)
     else:
-        logger.info("File {} found in {}".format(zipfile_name, folder))
+        logger.info("File '{}' found in {}".format(zipfile_name, folder))
 
     # Extract data from zip.
-    logger.info("Extracting data from {}".format(zipfile_name))
+    logger.info("Extracting data from '{}'".format(zipfile_name))
     with zipfile.ZipFile(os.path.join(folder, zipfile_name), 'r') as zip_ref:
         zip_ref.extractall(folder)
 
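
For reference, a minimal usage sketch (not part of the patch) of the new pickled-embeddings path, calling the renamed load_pretrained_embeddings() from ptp/components/utils/embeddings.py directly rather than through a pipeline. The import alias, the toy word_to_ix vocabulary and the data folder below are assumptions made for illustration; in the pipeline the mapping comes from questions.all.word.mappings.csv and the folder from data_folder in the config above.

# Usage sketch (illustrative only): exercise the pickled MIMIC fastText path directly.
import logging
import os

import ptp.components.utils.embeddings as emb  # assumed import form; module path is from the patch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("mimic_embeddings_demo")

# Toy word:index mapping; the real one is built from the configured word_mappings_file.
word_to_ix = {"<PAD>": 0, "what": 1, "modality": 2, "is": 3, "shown": 4}

# Hypothetical data folder; the loader expects it to already exist.
folder = os.path.expanduser("~/data/vqa-med")
os.makedirs(folder, exist_ok=True)

# Downloads mimic.fastText.no_clean.300d.pickled on the first run, then cherry-picks
# the 300-d vectors for the words above; words missing from the map keep random vectors.
emb_vectors = emb.load_pretrained_embeddings(
    logger, folder, "mimic.fastText.no_clean.300d.pickled", word_to_ix, 300)

print(emb_vectors.shape)  # -> torch.Size([5, 300])

Since SentenceEmbeddings passes self.data_folder and the configured pretrained_embeddings_file straight through to this function, the same path is exercised when the c1_classification_question_mimic_rnn pipeline above is run.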