@@ -0,0 +1,42 @@
# Load config defining problems for training, validation and testing.
default_configs: vqa_med_2019/c1_classification/default_c1_classification.yml

pipeline:
  name: c1_classification_question_mimic_rnn

  # Questions encoding.
  question_tokenizer:
    type: SentenceTokenizer
    priority: 1.1
    streams:
      inputs: questions
      outputs: tokenized_questions

  # Model 1: Embeddings
  question_embeddings:
    type: SentenceEmbeddings
    priority: 1.2
    embeddings_size: 300
    pretrained_embeddings_file: mimic.fastText.no_clean.300d.pickled
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions

  # Model 2: RNN
  lstm:
    type: RecurrentNeuralNetwork
    cell_type: LSTM
    prediction_mode: Last
    priority: 3
    initial_state: Zero
    #num_layers: 5
    hidden_size: 50
    streams:
      inputs: embedded_questions
    globals:
      input_size: embeddings_size
      prediction_size: vocabulary_size_c1

#: pipeline
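Note on the new config: the pipeline tokenizes each question, embeds the tokens with 300-d MIMIC fastText vectors, and feeds the sequence to an LSTM whose last output is projected onto the C1 answer vocabulary. A minimal PyTorch sketch of that flow, assuming the component names above map onto standard layers (QuestionClassifierSketch and all sizes below are illustrative, not the actual ptp implementation):

import torch
import torch.nn as nn

class QuestionClassifierSketch(nn.Module):
    """Illustrative stand-in for the tokenizer -> embeddings -> LSTM pipeline above."""
    def __init__(self, vocab_size, num_classes, embeddings_size=300, hidden_size=50):
        super().__init__()
        # SentenceEmbeddings: word indices -> 300-d vectors (pretrained weights would be loaded here).
        self.embeddings = nn.Embedding(vocab_size, embeddings_size)
        # RecurrentNeuralNetwork with cell_type LSTM and initial_state Zero.
        self.lstm = nn.LSTM(embeddings_size, hidden_size, batch_first=True)
        # prediction_size is vocabulary_size_c1, i.e. the number of C1 answer classes.
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, token_indices):
        embedded = self.embeddings(token_indices)   # [batch, seq_len, 300]
        outputs, _ = self.lstm(embedded)            # zero initial state by default
        last = outputs[:, -1, :]                    # prediction_mode: Last
        return self.classifier(last)

# Toy usage: batch of 2 questions, 6 tokens each, 10 possible answer classes.
model = QuestionClassifierSketch(vocab_size=100, num_classes=10)
logits = model(torch.randint(0, 100, (2, 6)))
print(logits.shape)  # torch.Size([2, 10])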
@@ -2,8 +2,8 @@
 default_configs: vqa_med_2019/c1_classification/default_c1_classification.yml

 pipeline:
-  name: vqa_med_c1_classification_question_rnn
+  name: c1_classification_question_rnn

   # Questions encoding.
   question_tokenizer:
     type: SentenceTokenizer
2 changes: 1 addition & 1 deletion ptp/components/models/sentence_embeddings.py
@@ -70,7 +70,7 @@ def __init__(self, name, config):

         # Load the embeddings first.
         if self.config["pretrained_embeddings_file"] != '':
-            emb_vectors = emb.load_pretrained_glove_vectors(self.logger, self.data_folder, self.config["pretrained_embeddings_file"], self.word_to_ix, self.embeddings_size)
+            emb_vectors = emb.load_pretrained_embeddings(self.logger, self.data_folder, self.config["pretrained_embeddings_file"], self.word_to_ix, self.embeddings_size)
             self.embeddings.weight = torch.nn.Parameter(emb_vectors)


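In the component above, the tensor returned by the renamed loader (shape [vocabulary size, embeddings_size], float dtype) simply overwrites the randomly initialized nn.Embedding weight. A minimal standalone sketch of that pattern; the vocabulary and vectors below are made up:

import torch
import torch.nn as nn

# Pretend this came from load_pretrained_embeddings(): one 300-d row per vocabulary entry.
vocab = {"what": 0, "modality": 1, "is": 2, "shown": 3}
emb_vectors = torch.randn(len(vocab), 300)

embeddings = nn.Embedding(len(vocab), 300)
embeddings.weight = torch.nn.Parameter(emb_vectors)   # same assignment as in the diff above

# Equivalent one-liner; freeze=False keeps the vectors trainable, mirroring the assignment above.
embeddings_alt = nn.Embedding.from_pretrained(emb_vectors, freeze=False)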
69 changes: 48 additions & 21 deletions ptp/components/utils/embeddings.py
@@ -20,11 +20,10 @@
 import os
 import numpy as np
 import torch
-
 import ptp.components.utils.io as io


-def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, embeddings_size):
+def load_pretrained_embeddings(logger, folder, embeddings_name, word_to_ix, embeddings_size):
     """
     Creates embedding vector for words from the provided (word:index) mappings (dictionary).

@@ -40,6 +39,7 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, embeddings_size):
         - glove.42B.300d.txt
         - glove.840B.300d.txt
         - glove.twitter.27B.txt
+        - fasttext.mimic.300d.txt

     :param logger: Logger object.

@@ -67,41 +67,68 @@ def load_pretrained_glove_vectors(logger, folder, embeddings_name, word_to_ix, embeddings_size):
pretrained_embeddings_urls["glove.42B.300d.txt"] = ("http://nlp.stanford.edu/data/glove.42B.300d.zip", "glove.42B.300d.zip")
pretrained_embeddings_urls["glove.840B.300d.txt"] = ("http://nlp.stanford.edu/data/glove.840B.300d.zip", "glove.840B.300d.zip")
pretrained_embeddings_urls["glove.twitter.27B.txt"] = ("http://nlp.stanford.edu/data/glove.twitter.27B.zip", "glove.twitter.27B.zip")
pretrained_embeddings_urls["mimic.fastText.no_clean.300d.pickled"] = ("https://mednli.blob.core.windows.net/shared/word_embeddings/mimic.fastText.no_clean.300d.pickled","mimic.fastText.no_clean.300d.pickled")

if (embeddings_name not in pretrained_embeddings_urls.keys()):
logger.error("Cannot load the indicated pretrained embeddings (current '{}' must be one of {})".format(embeddings_name, pretrained_embeddings_urls.keys()))
exit(1)

# Check presence of the file.
if not io.check_file_existence(folder, embeddings_name):
# Download and extract wikitext zip.
io.download_extract_zip_file(logger, folder, pretrained_embeddings_urls[embeddings_name][0], pretrained_embeddings_urls[embeddings_name][1])
else:
logger.info("File '{}' containing pretrained embeddings found in '{}' folder".format(embeddings_name, folder))
logger.info("Initializing embeddings in folder {}".format(folder))

num_loaded_embs = 0
# Set random embeddings for words "out of vocabulary".
# embeddings = np.zeros((len(word_to_ix), embeddings_size))
embeddings = np.random.normal(scale=0.6, size=(len(word_to_ix), embeddings_size))

# Open the embeddings file.
with open(os.path.join(folder, embeddings_name)) as f:
# Parse file
for line in f.readlines():
values = line.split()
# Get word.
word = values[0]
# Get index.
index = word_to_ix.get(word)
if index:
vector = np.array(values[1:], dtype='float32')
if embeddings_name == "mimic.fastText.no_clean.300d.pickled":
# Check if pickle exists.
file_name = pretrained_embeddings_urls[embeddings_name][1]
if not io.check_file_existence(folder, file_name):
# Try to download the pickle.
url = pretrained_embeddings_urls[embeddings_name][0]
logger.info("Downloading file '{}' from {}".format(file_name, url))
io.download(folder, file_name, url)
else:
logger.info("File '{}' found in {}".format(embeddings_name, folder))

# Load word embeddings map.
word_embedding_map = io.load_pickle(logger, os.path.join(folder, file_name))
# Iterate over map and cherry pick the vectors that fit our vocabulary.
for w, index in word_to_ix.items():
if w in word_embedding_map:
vector = word_embedding_map[w]
assert (len(vector) == embeddings_size), "Embeddings size must be equal to the size of pretrained embeddings!"
# Ok, set vector.
embeddings[index] = vector
# Increment counter.
num_loaded_embs += 1

else:

# Check presence of the file.
if not io.check_file_existence(folder, embeddings_name):
# Download and extract wikitext zip.
io.download_extract_zip_file(logger, folder, pretrained_embeddings_urls[embeddings_name][0], pretrained_embeddings_urls[embeddings_name][1])
else:
logger.info("File '{}' containing pretrained embeddings found in '{}' folder".format(embeddings_name, folder))


with open(os.path.join(folder, embeddings_name)) as f:
# Parse file and cherry pick the vectors that fit our vocabulary.
for line in f.readlines():
values = line.split()
# Get word.
word = values[0]
# Get index.
index = word_to_ix.get(word)
if index:
vector = np.array(values[1:], dtype='float32')
assert (len(vector) == embeddings_size), "Embeddings size must be equal to the size of pretrained embeddings!"
# Ok, set vector.
embeddings[index] = vector
# Increment counter.
num_loaded_embs += 1

logger.info("Loaded {} pretrained embeddings for vocabulary of size {} from {}".format(num_loaded_embs, len(word_to_ix), embeddings_name))

# Return matrix with embeddings.
return torch.from_numpy(embeddings).float()
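The new mimic branch assumes the pickled file deserializes to a dict-like map from token to a vector of length embeddings_size; nothing else about its format is used here. A toy sketch under that assumption (the file name, words and vectors below are invented for illustration):

import pickle
import numpy as np

# Build a toy word -> 300-d vector map with the structure the loading code above expects.
toy_map = {word: np.random.rand(300).astype("float32") for word in ["pneumonia", "ct", "mri"]}
with open("toy.fastText.300d.pickled", "wb") as f:
    pickle.dump(toy_map, f)

# Cherry-pick vectors for a vocabulary, as the mimic branch above does.
word_to_ix = {"mri": 0, "xray": 1}
embeddings = np.random.normal(scale=0.6, size=(len(word_to_ix), 300))
with open("toy.fastText.300d.pickled", "rb") as f:
    word_embedding_map = pickle.load(f)
for w, index in word_to_ix.items():
    if w in word_embedding_map:
        embeddings[index] = word_embedding_map[w]   # "xray" keeps its random initialization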

29 changes: 24 additions & 5 deletions ptp/components/utils/io.py
@@ -22,6 +22,26 @@
 import time
 import requests
 from pathlib import Path
+import pickle
+
+
+def load_pickle(logger, filename):
+    """
+    Loads pickle from file.
+
+    :param logger: Logger object.
+    :param filename: Absolute path along with the name of the file to be loaded.
+    """
+    try:
+        with open(str(filename), 'rb') as f:
+            obj = pickle.load(f)
+        logger.info('Loaded: %s', filename)
+
+    except EOFError:
+        logger.warning('Cannot load: %s', filename)
+        obj = None
+
+    return obj


 def save_nparray_to_csv_file(folder, filename, nparray, sep=','):
@@ -189,13 +209,12 @@ def download(folder, filename, url):
         r = requests.get(url)
         content_length = int(r.headers.get('content-length', None))
         count = 0
-
+        # Initialize download status.
         for chunk in r.iter_content(chunk_size=1024):
             if chunk:
                 f.write(chunk)
                 count += 1
                 reporthook(count, 1024, content_length)

-    #self.logger.info('Downloading {}'.format(url))

 def reporthook(count, block_size, total_size):
@@ -230,13 +249,13 @@ def download_extract_zip_file(logger, folder, url, zipfile_name):
logger.info("Initializing download in folder {}".format(folder))

if not check_file_existence(folder, zipfile_name):
logger.info("Downloading file {} from {}".format(zipfile_name, url))
logger.info("Downloading file '{}' from {}".format(zipfile_name, url))
download(folder, zipfile_name, url)
else:
logger.info("File {} found in {}".format(zipfile_name, folder))
logger.info("File '{}' found in {}".format(zipfile_name, folder))

# Extract data from zip.
logger.info("Extracting data from {}".format(zipfile_name))
logger.info("Extracting data from '{}'".format(zipfile_name))
with zipfile.ZipFile(os.path.join(folder, zipfile_name), 'r') as zip_ref:
zip_ref.extractall(folder)

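Taken together, the io.py additions let the embeddings loader fetch and read the pickle with the helpers shown above. An illustrative usage sketch (the folder mirrors the config's data_folder, the logger setup is a placeholder, and only functions shown or introduced in this PR are called):

import logging
import os
import ptp.components.utils.io as io

logger = logging.getLogger("embeddings")
folder = os.path.expanduser("~/data/vqa-med")
file_name = "mimic.fastText.no_clean.300d.pickled"
url = "https://mednli.blob.core.windows.net/shared/word_embeddings/mimic.fastText.no_clean.300d.pickled"

if not io.check_file_existence(folder, file_name):
    # download() streams the file in 1 KiB chunks and reports progress via reporthook().
    io.download(folder, file_name, url)

word_embedding_map = io.load_pickle(logger, os.path.join(folder, file_name))
if word_embedding_map is None:   # load_pickle() returns None on EOFError
    logger.warning("Could not read pretrained embeddings")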