Commit 0c052d4: clean up babi
vincentalbouy committed Nov 20, 2018 (1 parent: ddb720b)
Showing 3 changed files with 95 additions and 89 deletions.
@@ -22,14 +22,17 @@
from miprometheus.utils.problems_utils.language import Language
import torch.utils.data
from tqdm import tqdm
import requests
import os
from miprometheus.utils.app_state import AppState
-from miprometheus.problems.seq_to_seq.seq_to_seq_problem import SeqToSeqProblem
+from miprometheus.problems.seq_to_seq.text2text.text_to_text_problem import TextToTextProblem

from miprometheus.utils.loss.masked_cross_entropy_loss import MaskedCrossEntropyLoss



-class BABI(SeqToSeqProblem):
+class bAbIQASingleQuestion(TextToTextProblem):
"""
Problem Class for loading bAbi QA data set using Torchtext
@@ -45,9 +48,9 @@ def __init__(self, params):
:param params: Dictionary of parameters (read from configuration file).
"""
-super(BABI).__init__()
+super(bAbIQASingleQuestion).__init__()

-self.directory = './'
+self.directory = '~/data/babi/'

# boolean: is it training phase?
self.data_type = params['data_type']
@@ -65,11 +68,9 @@ def __init__(self, params):

self.batch_size = params['batch_size']

-self.memory_size = params['truncation_length']

self.embedding_type = params['embedding_type']

-self.embedding_size = 38
+self.embedding_size = params['embedding_size']

self.init_token = '<sos>'

@@ -92,9 +93,9 @@ def __init__(self, params):

self.default_values = {'input_item_size': self.embedding_size , 'output_item_size':self.embedding_size}

-self.data_definitions = {'sequences': {'size': [-1, -1, self.memory_size], 'type': [torch.Tensor]},
+self.data_definitions = {'sequences': {'size': [-1, -1, self.embedding_size], 'type': [torch.Tensor]},
'targets': {'size': [-1], 'type': [torch.Tensor]},
-'current_question': {'size': [-1, 1], 'type': [list, str]},
+'current_questions': {'size': [-1, 1], 'type': [list, str]},
'masks': {'size': [-1], 'type': [torch.Tensor]},
}
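The `data_definitions` hunk above swaps `memory_size` for `embedding_size` in the declared shape of `sequences` and pluralizes `current_question` to `current_questions`. As a reading aid only, here is a minimal sketch of a sample dict matching the new definition, with made-up batch and sequence sizes (the `-1` entries mean variable-length dimensions):

```python
import torch

embedding_size = 38  # matches the 'embedding_size' entry in params
sample = {
    'sequences': torch.zeros(10, 25, embedding_size),  # [batch, seq_len, embedding_size]
    'targets': torch.zeros(10, dtype=torch.long),      # one class index per sequence
    'current_questions': [['where is john ?']],        # list of question strings
    'masks': torch.zeros(10),
}
```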

@@ -348,7 +349,7 @@ def download_from_url(self, url, path):
if chunk:
f.write(chunk)

-def load_data(self, path=None, root='.data', tasks=[1], tenK=False, add_punctuation=True, data_type='train',
+def load_data(self, path=None, root='data', tasks=[1], tenK=False, add_punctuation=True, data_type='train',
outmod=''):

"""loads all asked for tasks into a single file (combining multiple files) and then parses the combined file"""
@@ -482,11 +483,11 @@ def parse(self, file_data, add_punctuation):

babi_tasks = list(range(1, 21))

-params = {'directory': '/', 'tasks': babi_tasks,'data_type': 'train', 'batch_size': 10,'embedding_type' :'glove.6B.100d', 'ten_thousand_examples': True, 'one_hot_embedding': True, 'truncation_length':50 }
+params = {'directory': '/', 'tasks': babi_tasks,'data_type': 'train', 'batch_size': 10,'embedding_type' :'glove.6B.100d', 'embedding_size' :38 , 'ten_thousand_examples': True, 'one_hot_embedding': True, 'truncation_length':50 }



-babi = BABI(params)
+babi = bAbIQASingleQuestion(params)
sample=babi[12]
print(sample)
print('__getitem__ works.')
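Two notes on the hunks above. First, the renamed call `super(bAbIQASingleQuestion).__init__()` keeps the shape of the old `super(BABI).__init__()`: with a single argument, `super` returns an unbound super object, so the parent initializer is never actually invoked; `super(bAbIQASingleQuestion, self).__init__(params)` would run it. Second, the `download_from_url` hunk shows the tail of the standard `requests` streaming pattern (write each chunk, skip empty keep-alive chunks). A self-contained sketch of that pattern, with a placeholder URL and chunk size rather than the values used by the class:

```python
import requests

def download_from_url(url, path, chunk_size=16 * 1024):
    # Stream the response to disk instead of loading it into memory.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:  # skip empty keep-alive chunks
                f.write(chunk)

# Hypothetical usage; the class itself downloads the bAbI archive under self.directory.
# download_from_url('http://example.com/babi.tar.gz', 'babi.tar.gz')
```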
79 changes: 1 addition & 78 deletions miprometheus/problems/seq_to_seq/seq_to_seq_problem.py
@@ -24,7 +24,7 @@

from miprometheus.problems.problem import Problem
import torch
-from miprometheus.utils.app_state import AppState




@@ -77,84 +77,7 @@ def evaluate_loss(self, data_dict, logits):
return loss


-def to_dictionary_indexes(self, dictionary, sentence):
-"""
-Outputs indexes of the dictionary corresponding to the words in the sequence.
-Case insensitive.
-"""
-
-idxs = torch.tensor([dictionary[w.lower()] for w in sentence]).type(AppState().LongTensor)
-return idxs
-
-def indices_to_words(self, int_sentence):
-
-sentences = []
-for ind in int_sentence[0, :]:
-sentences.append(self.itos_dict[ind])
-return sentences
-
-def embed_sentence_one_hot(self, sentence):
-"""
-Embed an entire sentence using a pretrained embedding
-:param sentence: A string containing the words to embed
-:returns: FloatTensor of embedded vectors [max_sentence_length, embedding size]
-"""
-size_hot = len(self.dictionaries)
-outsentence = torch.zeros((len(sentence.split(" ")), size_hot))
-# for key, value in self.dictionaries.items():
-# print(key, value)
-
-# print(size_hot)
-# embed a word at a time
-for i, word in enumerate(sentence.split(" ")):
-if not word.lower() == self.pad_token:
-index = self.dictionaries[word.lower()]
-# print(index, word)
-outsentence[i, index] = 1
-# print(outsentence[i,:])
-
-return outsentence
-
-# Change name to embed sentence
-
-def embed_batch(self, minibatch):
-
-ex = minibatch
-sentence = " ".join(ex)
-
-if self.one_hot_embedding:
-sent_embed = self.embed_sentence_one_hot(sentence)
-else:
-sent_embed = self.language.embed_sentence(sentence)
-
-return sent_embed
-
-def tokenize(self, sentence):
-return sentence.split(' ')
-
-# list to string
-
-def detokenize_story(self, minibatch):
-a = []
-for ex in minibatch:
-b = []
-# print(ex)
-for sentence in ex:
-b.append(" ".join(sentence))
-a.append(b)
-return a
-
-# string to list
-
-def tokenize_story(self, minibatch):
-a = []
-for ex in minibatch:
-b = []
-# print(ex)
-for sentence in ex:
-b.append(self.tokenize(sentence))
-a.append(b)
-return a


if __name__ == '__main__':
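The helpers deleted here are moved, unchanged, into `TextToTextProblem` below. For reference, a round-trip sketch of the two dictionary helpers, using a toy vocabulary and a plain `dtype=torch.long` tensor in place of `AppState().LongTensor` (both assumptions, made for the sake of a runnable example):

```python
import torch

# Toy string-to-index vocabulary; the real problem builds this from the bAbI corpus.
stoi = {'where': 0, 'is': 1, 'john': 2, '?': 3}
itos = {v: k for k, v in stoi.items()}

def to_dictionary_indexes(dictionary, sentence):
    # Case-insensitive lookup of each word's index, as in the method above.
    return torch.tensor([dictionary[w.lower()] for w in sentence], dtype=torch.long)

idxs = to_dictionary_indexes(stoi, ['Where', 'is', 'John', '?'])
print(idxs.tolist())                 # [0, 1, 2, 3]
print([itos[int(i)] for i in idxs])  # ['where', 'is', 'john', '?']
```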
82 changes: 82 additions & 0 deletions miprometheus/problems/seq_to_seq/text2text/text_to_text_problem.py
@@ -51,6 +51,7 @@
import torch
import torch.nn as nn
from miprometheus.problems.seq_to_seq.seq_to_seq_problem import SeqToSeqProblem
+from miprometheus.utils.app_state import AppState

# global tokens
PAD_token = 0
@@ -326,6 +327,85 @@ def tensors_from_pairs(self, pairs, input_lang, output_lang):
"""
return [self.tensors_from_pair(pair, input_lang, output_lang) for pair in pairs]

+def to_dictionary_indexes(self, dictionary, sentence):
+"""
+Outputs indexes of the dictionary corresponding to the words in the sequence.
+Case insensitive.
+"""
+
+idxs = torch.tensor([dictionary[w.lower()] for w in sentence]).type(AppState().LongTensor)
+return idxs
+
+def indices_to_words(self, int_sentence):
+
+sentences = []
+for ind in int_sentence[0, :]:
+sentences.append(self.itos_dict[ind])
+return sentences
+
+def embed_sentence_one_hot(self, sentence):
+"""
+Embed an entire sentence using a pretrained embedding
+:param sentence: A string containing the words to embed
+:returns: FloatTensor of embedded vectors [max_sentence_length, embedding size]
+"""
+size_hot = len(self.dictionaries)
+outsentence = torch.zeros((len(sentence.split(" ")), size_hot))
+# for key, value in self.dictionaries.items():
+# print(key, value)
+
+# print(size_hot)
+# embed a word at a time
+for i, word in enumerate(sentence.split(" ")):
+if not word.lower() == self.pad_token:
+index = self.dictionaries[word.lower()]
+# print(index, word)
+outsentence[i, index] = 1
+# print(outsentence[i,:])
+
+return outsentence
+
+# Change name to embed sentence
+
+def embed_batch(self, minibatch):
+
+ex = minibatch
+sentence = " ".join(ex)
+
+if self.one_hot_embedding:
+sent_embed = self.embed_sentence_one_hot(sentence)
+else:
+sent_embed = self.language.embed_sentence(sentence)
+
+return sent_embed
+
+def tokenize(self, sentence):
+return sentence.split(' ')
+
+# list to string
+
+def detokenize_story(self, minibatch):
+a = []
+for ex in minibatch:
+b = []
+# print(ex)
+for sentence in ex:
+b.append(" ".join(sentence))
+a.append(b)
+return a
+
+# string to list
+
+def tokenize_story(self, minibatch):
+a = []
+for ex in minibatch:
+b = []
+# print(ex)
+for sentence in ex:
+b.append(self.tokenize(sentence))
+a.append(b)
+return a
+

class Lang(object):
"""
@@ -395,3 +475,5 @@ def add_word(self, word):

else: # this word has been seen before, simply update its occurrence
self.word2count[word] += 1


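Among the methods added above, `embed_sentence_one_hot` carries the main logic: each word becomes a row of length `len(self.dictionaries)` with a single 1 at the word's index, and pad tokens stay all-zero. A standalone sketch of that behavior under a toy five-word vocabulary (an assumption; the class uses the dictionaries it builds from the data):

```python
import torch

def embed_sentence_one_hot(sentence, dictionaries, pad_token='<pad>'):
    # One row per word, one column per vocabulary entry.
    size_hot = len(dictionaries)
    words = sentence.split(" ")
    out = torch.zeros((len(words), size_hot))
    for i, word in enumerate(words):
        if word.lower() != pad_token:  # pad rows stay all-zero
            out[i, dictionaries[word.lower()]] = 1
    return out

vocab = {'mary': 0, 'went': 1, 'to': 2, 'the': 3, 'garden': 4}
emb = embed_sentence_one_hot("Mary went to the garden", vocab)
print(emb.shape)  # torch.Size([5, 5]): [sentence_length, vocab_size]
```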