From f503ac08d1754855ef95afeb716e96fc8eb60062 Mon Sep 17 00:00:00 2001
From: Chaitanya Shivade <43888804+cshivade@users.noreply.github.com>
Date: Tue, 30 Apr 2019 11:23:38 -0700
Subject: [PATCH 1/3] Vectors on clinical data (MIMIC)

Added code to pull pretrained vectors trained on MIMIC.
---
 ptp/components/utils/embeddings.py | 51 ++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/ptp/components/utils/embeddings.py b/ptp/components/utils/embeddings.py
index bd7887a..4ba7c28 100644
--- a/ptp/components/utils/embeddings.py
+++ b/ptp/components/utils/embeddings.py
@@ -20,9 +20,22 @@ import os
 import numpy as np
 import torch
-
+import pickle
 import ptp.components.utils.io as io
 
+def load_pickle(filename):
+    try:
+        with open(str(filename), 'rb') as f:
+            obj = pickle.load(f)
+
+        logging.info('Loaded: %s', filename)
+
+    except EOFError:
+        logging.warning('Cannot load: %s', filename)
+        obj = None
+
+    return obj
+
 
 def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, embeddings_size):
     """
@@ -67,6 +80,7 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, e
     pretrained_embeddings_urls["glove.42B.300d.txt"] = ("http://nlp.stanford.edu/data/glove.42B.300d.zip", "glove.42B.300d.zip")
     pretrained_embeddings_urls["glove.840B.300d.txt"] = ("http://nlp.stanford.edu/data/glove.840B.300d.zip", "glove.840B.300d.zip")
     pretrained_embeddings_urls["glove.twitter.27B.txt"] = ("http://nlp.stanford.edu/data/glove.twitter.27B.zip", "glove.twitter.27B.zip")
+    pretrained_embeddings_urls["fasttext.mimic.300d.txt"] = ("https://mednli.blob.core.windows.net/shared/word_embeddings/mimic.fastText.no_clean.300d.pickled","mimic.fastText.no_clean.300d.pickled")
 
     if (embeddings_name not in pretrained_embeddings_urls.keys()):
         logger.error("Cannot load the indicated pretrained embeddings (current '{}' must be one of {})".format(embeddings_name, pretrained_embeddings_urls.keys()))
         exit(1)
 
@@ -84,24 +98,33 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, e
     # embeddings = np.zeros((len(word_to_ix), embeddings_size))
     embeddings = np.random.normal(scale=0.6, size=(len(word_to_ix), embeddings_size))
     # Open the embeddings file.
-    with open(os.path.join(folder, embeddings_name)) as f:
-        # Parse file
-        for line in f.readlines():
-            values = line.split()
-            # Get word.
-            word = values[0]
-            # Get index.
-            index = word_to_ix.get(word)
-            if index:
-                vector = np.array(values[1:], dtype='float32')
+    if embeddings_name == 'mimic':
+        word_embedding_map = load_pickle(os.path.join(folder, embeddings_name))
+        for w, index in word_to_ix.items():
+            if w in word_embedding_map:
+                vector = word_embedding_map[w]
                 assert (len(vector) == embeddings_size), "Embeddings size must be equal to the size of pretrained embeddings!"
-                # Ok, set vector.
                 embeddings[index] = vector
-                # Increment counter.
                 num_loaded_embs += 1
+
+    else:
+        with open(os.path.join(folder, embeddings_name)) as f:
+            # Parse file
+            for line in f.readlines():
+                values = line.split()
+                # Get word.
+                word = values[0]
+                # Get index.
+                index = word_to_ix.get(word)
+                if index is not None:
+                    vector = np.array(values[1:], dtype='float32')
+                    assert (len(vector) == embeddings_size), "Embeddings size must be equal to the size of pretrained embeddings!"
+                    # Ok, set vector.
+                    embeddings[index] = vector
+                    # Increment counter.
+                    num_loaded_embs += 1
 
     logger.info("Loaded {} pretrained embeddings for vocabulary of size {} from {}".format(num_loaded_embs, len(word_to_ix), embeddings_name))
 
     # Return matrix with embeddings.
     return torch.from_numpy(embeddings).float()
-

From 8128855c4e873d2b634afc3cd2d8937121b1547f Mon Sep 17 00:00:00 2001
From: tkornut
Date: Tue, 30 Apr 2019 17:01:48 -0700
Subject: [PATCH 2/3] Making the MIMIC download/load work :]

---
 .../c1_classification_question_mimic_rnn.yml  | 42 +++++++++++++++
 .../c1_classification_question_rnn.yml        |  4 +-
 ptp/components/models/sentence_embeddings.py  |  2 +-
 ptp/components/utils/embeddings.py            | 54 ++++++++++---------
 ptp/components/utils/io.py                    | 35 +++++++++---
 5 files changed, 101 insertions(+), 36 deletions(-)
 create mode 100644 configs/vqa_med_2019/c1_classification/c1_classification_question_mimic_rnn.yml

diff --git a/configs/vqa_med_2019/c1_classification/c1_classification_question_mimic_rnn.yml b/configs/vqa_med_2019/c1_classification/c1_classification_question_mimic_rnn.yml
new file mode 100644
index 0000000..aea4592
--- /dev/null
+++ b/configs/vqa_med_2019/c1_classification/c1_classification_question_mimic_rnn.yml
@@ -0,0 +1,42 @@
+# Load config defining problems for training, validation and testing.
+default_configs: vqa_med_2019/c1_classification/default_c1_classification.yml
+
+pipeline:
+  name: c1_classification_question_mimic_rnn
+
+  # Questions encoding.
+  question_tokenizer:
+    type: SentenceTokenizer
+    priority: 1.1
+    streams:
+      inputs: questions
+      outputs: tokenized_questions
+
+  # Model 1: Embeddings
+  question_embeddings:
+    type: SentenceEmbeddings
+    priority: 1.2
+    embeddings_size: 300
+    pretrained_embeddings_file: mimic.fastText.no_clean.300d.pickled
+    data_folder: ~/data/vqa-med
+    word_mappings_file: questions.all.word.mappings.csv
+    streams:
+      inputs: tokenized_questions
+      outputs: embedded_questions
+
+  # Model 2: RNN
+  lstm:
+    type: RecurrentNeuralNetwork
+    cell_type: LSTM
+    prediction_mode: Last
+    priority: 3
+    initial_state: Zero
+    #num_layers: 5
+    hidden_size: 50
+    streams:
+      inputs: embedded_questions
+    globals:
+      input_size: embeddings_size
+      prediction_size: vocabulary_size_c1
+
+#: pipeline

diff --git a/configs/vqa_med_2019/c1_classification/c1_classification_question_rnn.yml b/configs/vqa_med_2019/c1_classification/c1_classification_question_rnn.yml
index 9242d43..4c1d3a6 100644
--- a/configs/vqa_med_2019/c1_classification/c1_classification_question_rnn.yml
+++ b/configs/vqa_med_2019/c1_classification/c1_classification_question_rnn.yml
@@ -2,8 +2,8 @@ default_configs: vqa_med_2019/c1_classification/default_c1_classification.yml
 
 pipeline:
-  name: vqa_med_c1_classification_question_rnn
-
+  name: c1_classification_question_rnn
+
   # Questions encoding.
   question_tokenizer:
     type: SentenceTokenizer

diff --git a/ptp/components/models/sentence_embeddings.py b/ptp/components/models/sentence_embeddings.py
index f263e60..ef4f75c 100644
--- a/ptp/components/models/sentence_embeddings.py
+++ b/ptp/components/models/sentence_embeddings.py
@@ -70,7 +70,7 @@ def __init__(self, name, config):
 
         # Load the embeddings first.
if self.config["pretrained_embeddings_file"] != '': - emb_vectors = emb.load_pretrained_glove_vectors(self.logger, self.data_folder, self.config["pretrained_embeddings_file"], self.word_to_ix, self.embeddings_size) + emb_vectors = emb.load_pretrained_embeddings(self.logger, self.data_folder, self.config["pretrained_embeddings_file"], self.word_to_ix, self.embeddings_size) self.embeddings.weight = torch.nn.Parameter(emb_vectors) diff --git a/ptp/components/utils/embeddings.py b/ptp/components/utils/embeddings.py index 4ba7c28..6d22d71 100644 --- a/ptp/components/utils/embeddings.py +++ b/ptp/components/utils/embeddings.py @@ -20,24 +20,10 @@ import os import numpy as np import torch -import pickle import ptp.components.utils.io as io -def load_pickle(filename): - try: - with open(str(filename), 'rb') as f: - obj = pickle.load(f) - logging.info('Loaded: %s', filename) - - except EOFError: - logging.warning('Cannot load: %s', filename) - obj = None - - return obj - - -def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, embeddings_size): +def load_pretrained_embeddings(logger, folder, embeddings_name, word_to_ix, embeddings_size): """ Creates embedding vector for words from the provided (word:index) mappings (dictionary). @@ -53,6 +39,7 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, e - glove.42B.300d.txt - glove.840B.300d.txt - glove.twitter.27B.txt + - fasttext.mimic.300d.txt :param logger: Logger object. @@ -80,26 +67,34 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, e pretrained_embeddings_urls["glove.42B.300d.txt"] = ("http://nlp.stanford.edu/data/glove.42B.300d.zip", "glove.42B.300d.zip") pretrained_embeddings_urls["glove.840B.300d.txt"] = ("http://nlp.stanford.edu/data/glove.840B.300d.zip", "glove.840B.300d.zip") pretrained_embeddings_urls["glove.twitter.27B.txt"] = ("http://nlp.stanford.edu/data/glove.twitter.27B.zip", "glove.twitter.27B.zip") - pretrained_embeddings_urls["fasttext.mimic.300d.txt"] = ("https://mednli.blob.core.windows.net/shared/word_embeddings/mimic.fastText.no_clean.300d.pickled","mimic.fastText.no_clean.300d.pickled") + pretrained_embeddings_urls["mimic.fastText.no_clean.300d.pickled"] = ("https://mednli.blob.core.windows.net/shared/word_embeddings/mimic.fastText.no_clean.300d.pickled","mimic.fastText.no_clean.300d.pickled") if (embeddings_name not in pretrained_embeddings_urls.keys()): logger.error("Cannot load the indicated pretrained embeddings (current '{}' must be one of {})".format(embeddings_name, pretrained_embeddings_urls.keys())) exit(1) - # Check presence of the file. - if not io.check_file_existence(folder, embeddings_name): - # Download and extract wikitext zip. - io.download_extract_zip_file(logger, folder, pretrained_embeddings_urls[embeddings_name][0], pretrained_embeddings_urls[embeddings_name][1]) - else: - logger.info("File '{}' containing pretrained embeddings found in '{}' folder".format(embeddings_name, folder)) + logger.info("Initializing embeddings in folder {}".format(folder)) num_loaded_embs = 0 # Set random embeddings for words "out of vocabulary". # embeddings = np.zeros((len(word_to_ix), embeddings_size)) embeddings = np.random.normal(scale=0.6, size=(len(word_to_ix), embeddings_size)) + # Open the embeddings file. - if embeddings_name == 'mimic': - word_embedding_map = load_pickle(os.path.join(folder, embeddings_name)) + if embeddings_name == "mimic.fastText.no_clean.300d.pickled": + # Check if pickle exists. 
+        file_name = pretrained_embeddings_urls[embeddings_name][1]
+        if not io.check_file_existence(folder, file_name):
+            # Try to download the pickle.
+            url = pretrained_embeddings_urls[embeddings_name][0]
+            logger.info("Downloading file '{}' from {}".format(file_name, url))
+            io.download(folder, file_name, url)
+        else:
+            logger.info("File '{}' found in {}".format(embeddings_name, folder))
+
+        # Load word embeddings map.
+        word_embedding_map = io.load_pickle(logger, os.path.join(folder, file_name))
+        # Iterate over map and cherry-pick the vectors that fit our vocabulary.
         for w, index in word_to_ix.items():
             if w in word_embedding_map:
                 vector = word_embedding_map[w]
@@ -108,8 +103,17 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, e
                 num_loaded_embs += 1
 
     else:
+
+        # Check presence of the file.
+        if not io.check_file_existence(folder, embeddings_name):
+            # Download and extract wikitext zip.
+            io.download_extract_zip_file(logger, folder, pretrained_embeddings_urls[embeddings_name][0], pretrained_embeddings_urls[embeddings_name][1])
+        else:
+            logger.info("File '{}' containing pretrained embeddings found in '{}' folder".format(embeddings_name, folder))
+
         with open(os.path.join(folder, embeddings_name)) as f:
-            # Parse file
+            # Parse file and cherry-pick the vectors that fit our vocabulary.
             for line in f.readlines():
                 values = line.split()
                 # Get word.
                 word = values[0]
                 # Get index.
                 index = word_to_ix.get(word)

diff --git a/ptp/components/utils/io.py b/ptp/components/utils/io.py
index 30074f0..364968b 100644
--- a/ptp/components/utils/io.py
+++ b/ptp/components/utils/io.py
@@ -22,6 +22,26 @@ import time
 import requests
 from pathlib import Path
+import pickle
+
+
+def load_pickle(logger, filename):
+    """
+    Loads a pickle from file.
+
+    :param logger: Logger object.
+
+    :param filename: Absolute path along with the name of the file to be loaded.
+    """
+    try:
+        with open(str(filename), 'rb') as f:
+            obj = pickle.load(f)
+        logger.info('Loaded: %s', filename)
+
+    except EOFError:
+        logger.warning('Cannot load: %s', filename)
+        obj = None
+
+    return obj
 
 
 def save_nparray_to_csv_file(folder, filename, nparray, sep=','):
@@ -189,12 +209,11 @@ def download(folder, filename, url):
         r = requests.get(url)
         content_length = int(r.headers.get('content-length', None))
         count = 0
-
+        # Download the file chunk by chunk, reporting progress.
         for chunk in r.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                count += 1
-                reporthook(count, 1024, content_length)
+            f.write(chunk)
+            count += 1
+            reporthook(count, 1024, content_length)
 
     #self.logger.info('Downloading {}'.format(url))
 
@@ -230,13 +249,13 @@ def download_extract_zip_file(logger, folder, url, zipfile_name):
     logger.info("Initializing download in folder {}".format(folder))
 
     if not check_file_existence(folder, zipfile_name):
-        logger.info("Downloading file {} from {}".format(zipfile_name, url))
+        logger.info("Downloading file '{}' from {}".format(zipfile_name, url))
         download(folder, zipfile_name, url)
     else:
-        logger.info("File {} found in {}".format(zipfile_name, folder))
+        logger.info("File '{}' found in {}".format(zipfile_name, folder))
 
     # Extract data from zip.
- logger.info("Extracting data from {}".format(zipfile_name)) + logger.info("Extracting data from '{}'".format(zipfile_name)) with zipfile.ZipFile(os.path.join(folder, zipfile_name), 'r') as zip_ref: zip_ref.extractall(folder) From 3a635b611997376ced98a1b495d0fb79ec706ffd Mon Sep 17 00:00:00 2001 From: tkornut Date: Tue, 30 Apr 2019 17:03:31 -0700 Subject: [PATCH 3/3] rever to Alexis download --- ptp/components/utils/io.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ptp/components/utils/io.py b/ptp/components/utils/io.py index 364968b..bbe7d58 100644 --- a/ptp/components/utils/io.py +++ b/ptp/components/utils/io.py @@ -211,10 +211,10 @@ def download(folder, filename, url): count = 0 # Initialize download status. for chunk in r.iter_content(chunk_size=1024): - f.write(chunk) - count += 1 - reporthook(count, 1024, content_length) - + if chunk: + f.write(chunk) + count += 1 + reporthook(count, 1024, content_length) #self.logger.info('Downloading {}'.format(url)) def reporthook(count, block_size, total_size):