Commit 0c052d4: clean up babi
vincentalbouy committed Nov 20, 2018 (1 parent: ddb720b)
Showing 3 changed files with 95 additions and 89 deletions.
@@ -22,14 +22,17 @@
from miprometheus.utils.problems_utils.language import Language
import torch.utils.data
from tqdm import tqdm
import requests
import os
from miprometheus.utils.app_state import AppState
-from miprometheus.problems.seq_to_seq.seq_to_seq_problem import SeqToSeqProblem
+from miprometheus.problems.seq_to_seq.text2text.text_to_text_problem import TextToTextProblem

from miprometheus.utils.loss.masked_cross_entropy_loss import MaskedCrossEntropyLoss



-class BABI(SeqToSeqProblem):
+class bAbIQASingleQuestion(TextToTextProblem):
"""
Problem Class for loading bAbi QA data set using Torchtext
@@ -45,9 +48,9 @@ def __init__(self, params):
:param params: Dictionary of parameters (read from configuration file).
"""
-super(BABI).__init__()
+super(bAbIQASingleQuestion).__init__()

-self.directory = './'
+self.directory = '~/data/babi/'

# boolean: is it training phase?
self.data_type = params['data_type']
@@ -65,11 +68,9 @@ def __init__(self, params):

self.batch_size = params['batch_size']

-self.memory_size = params['truncation_length']

self.embedding_type = params['embedding_type']

-self.embedding_size = 38
+self.embedding_size = params['embedding_size']

self.init_token = '<sos>'

@@ -92,9 +93,9 @@ def __init__(self, params):

self.default_values = {'input_item_size': self.embedding_size , 'output_item_size':self.embedding_size}

-self.data_definitions = {'sequences': {'size': [-1, -1, self.memory_size], 'type': [torch.Tensor]},
+self.data_definitions = {'sequences': {'size': [-1, -1, self.embedding_size], 'type': [torch.Tensor]},
'targets': {'size': [-1], 'type': [torch.Tensor]},
-'current_question': {'size': [-1, 1], 'type': [list, str]},
+'current_questions': {'size': [-1, 1], 'type': [list, str]},
'masks': {'size': [-1], 'type': [torch.Tensor]},
}
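The `data_definitions` hunk above swaps `memory_size` for `embedding_size` in the declared shape of `sequences` and pluralizes `current_question` to `current_questions`. As a reading aid only, here is a minimal sketch of a sample dict matching the new definition, with made-up batch and sequence sizes (the `-1` entries mean variable-length dimensions):

```python
import torch

embedding_size = 38  # matches the 'embedding_size' entry in params
sample = {
    'sequences': torch.zeros(10, 25, embedding_size),  # [batch, seq_len, embedding_size]
    'targets': torch.zeros(10, dtype=torch.long),      # one class index per sequence
    'current_questions': [['where is john ?']],        # list of question strings
    'masks': torch.zeros(10),
}
```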

@@ -348,7 +349,7 @@ def download_from_url(self, url, path):
if chunk:
f.write(chunk)

-def load_data(self, path=None, root='.data', tasks=[1], tenK=False, add_punctuation=True, data_type='train',
+def load_data(self, path=None, root='data', tasks=[1], tenK=False, add_punctuation=True, data_type='train',
outmod=''):

"""loads all asked for tasks into a single file (combining multiple files) and then parses the combined file"""
@@ -482,11 +483,11 @@ def parse(self, file_data, add_punctuation):

babi_tasks = list(range(1, 21))

-params = {'directory': '/', 'tasks': babi_tasks,'data_type': 'train', 'batch_size': 10,'embedding_type' :'glove.6B.100d', 'ten_thousand_examples': True, 'one_hot_embedding': True, 'truncation_length':50 }
+params = {'directory': '/', 'tasks': babi_tasks,'data_type': 'train', 'batch_size': 10,'embedding_type' :'glove.6B.100d', 'embedding_size' :38 , 'ten_thousand_examples': True, 'one_hot_embedding': True, 'truncation_length':50 }



-babi = BABI(params)
+babi = bAbIQASingleQuestion(params)
sample=babi[12]
print(sample)
print('__getitem__ works.')
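Two notes on the hunks above. First, the renamed call `super(bAbIQASingleQuestion).__init__()` keeps the shape of the old `super(BABI).__init__()`: with a single argument, `super` returns an unbound super object, so the parent initializer is never actually invoked; `super(bAbIQASingleQuestion, self).__init__(params)` would run it. Second, the `download_from_url` hunk shows the tail of the standard `requests` streaming pattern (write each chunk, skip empty keep-alive chunks). A self-contained sketch of that pattern, with a placeholder URL and chunk size rather than the values used by the class:

```python
import requests

def download_from_url(url, path, chunk_size=16 * 1024):
    # Stream the response to disk instead of loading it into memory.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:  # skip empty keep-alive chunks
                f.write(chunk)

# Hypothetical usage; the class itself downloads the bAbI archive under self.directory.
# download_from_url('http://example.com/babi.tar.gz', 'babi.tar.gz')
```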
79 changes: 1 addition & 78 deletions miprometheus/problems/seq_to_seq/seq_to_seq_problem.py
@@ -24,7 +24,7 @@

from miprometheus.problems.problem import Problem
import torch
-from miprometheus.utils.app_state import AppState




@@ -77,84 +77,7 @@ def evaluate_loss(self, data_dict, logits):
return loss


-def to_dictionary_indexes(self, dictionary, sentence):
-"""
-Outputs indexes of the dictionary corresponding to the words in the sequence.
-Case insensitive.
-"""
-
-idxs = torch.tensor([dictionary[w.lower()] for w in sentence]).type(AppState().LongTensor)
-return idxs
-
-def indices_to_words(self, int_sentence):
-
-sentences = []
-for ind in int_sentence[0, :]:
-sentences.append(self.itos_dict[ind])
-return sentences
-
-def embed_sentence_one_hot(self, sentence):
-"""
-Embed an entire sentence using a pretrained embedding
-:param sentence: A string containing the words to embed
-:returns: FloatTensor of embedded vectors [max_sentence_length, embedding size]
-"""
-size_hot = len(self.dictionaries)
-outsentence = torch.zeros((len(sentence.split(" ")), size_hot))
-# for key, value in self.dictionaries.items():
-# print(key, value)
-
-# print(size_hot)
-# embed a word at a time
-for i, word in enumerate(sentence.split(" ")):
-if not word.lower() == self.pad_token:
-index = self.dictionaries[word.lower()]
-# print(index, word)
-outsentence[i, index] = 1
-# print(outsentence[i,:])
-
-return outsentence
-
-# Change name to embed sentence
-
-def embed_batch(self, minibatch):
-
-ex = minibatch
-sentence = " ".join(ex)
-
-if self.one_hot_embedding:
-sent_embed = self.embed_sentence_one_hot(sentence)
-else:
-sent_embed = self.language.embed_sentence(sentence)
-
-return sent_embed
-
-def tokenize(self, sentence):
-return sentence.split(' ')
-
-# list to string
-
-def detokenize_story(self, minibatch):
-a = []
-for ex in minibatch:
-b = []
-# print(ex)
-for sentence in ex:
-b.append(" ".join(sentence))
-a.append(b)
-return a
-
-# string to list
-
-def tokenize_story(self, minibatch):
-a = []
-for ex in minibatch:
-b = []
-# print(ex)
-for sentence in ex:
-b.append(self.tokenize(sentence))
-a.append(b)
-return a


if __name__ == '__main__':
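The helpers deleted here are moved, unchanged, into `TextToTextProblem` below. For reference, a round-trip sketch of the two dictionary helpers, using a toy vocabulary and a plain `dtype=torch.long` tensor in place of `AppState().LongTensor` (both assumptions, made for the sake of a runnable example):

```python
import torch

# Toy string-to-index vocabulary; the real problem builds this from the bAbI corpus.
stoi = {'where': 0, 'is': 1, 'john': 2, '?': 3}
itos = {v: k for k, v in stoi.items()}

def to_dictionary_indexes(dictionary, sentence):
    # Case-insensitive lookup of each word's index, as in the method above.
    return torch.tensor([dictionary[w.lower()] for w in sentence], dtype=torch.long)

idxs = to_dictionary_indexes(stoi, ['Where', 'is', 'John', '?'])
print(idxs.tolist())                 # [0, 1, 2, 3]
print([itos[int(i)] for i in idxs])  # ['where', 'is', 'john', '?']
```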
82 changes: 82 additions & 0 deletions miprometheus/problems/seq_to_seq/text2text/text_to_text_problem.py
@@ -51,6 +51,7 @@
import torch
import torch.nn as nn
from miprometheus.problems.seq_to_seq.seq_to_seq_problem import SeqToSeqProblem
+from miprometheus.utils.app_state import AppState

# global tokens
PAD_token = 0
@@ -326,6 +327,85 @@ def tensors_from_pairs(self, pairs, input_lang, output_lang):
"""
return [self.tensors_from_pair(pair, input_lang, output_lang) for pair in pairs]

+def to_dictionary_indexes(self, dictionary, sentence):
+"""
+Outputs indexes of the dictionary corresponding to the words in the sequence.
+Case insensitive.
+"""
+
+idxs = torch.tensor([dictionary[w.lower()] for w in sentence]).type(AppState().LongTensor)
+return idxs
+
+def indices_to_words(self, int_sentence):
+
+sentences = []
+for ind in int_sentence[0, :]:
+sentences.append(self.itos_dict[ind])
+return sentences
+
+def embed_sentence_one_hot(self, sentence):
+"""
+Embed an entire sentence using a pretrained embedding
+:param sentence: A string containing the words to embed
+:returns: FloatTensor of embedded vectors [max_sentence_length, embedding size]
+"""
+size_hot = len(self.dictionaries)
+outsentence = torch.zeros((len(sentence.split(" ")), size_hot))
+# for key, value in self.dictionaries.items():
+# print(key, value)
+
+# print(size_hot)
+# embed a word at a time
+for i, word in enumerate(sentence.split(" ")):
+if not word.lower() == self.pad_token:
+index = self.dictionaries[word.lower()]
+# print(index, word)
+outsentence[i, index] = 1
+# print(outsentence[i,:])
+
+return outsentence
+
+# Change name to embed sentence
+
+def embed_batch(self, minibatch):
+
+ex = minibatch
+sentence = " ".join(ex)
+
+if self.one_hot_embedding:
+sent_embed = self.embed_sentence_one_hot(sentence)
+else:
+sent_embed = self.language.embed_sentence(sentence)
+
+return sent_embed
+
+def tokenize(self, sentence):
+return sentence.split(' ')
+
+# list to string
+
+def detokenize_story(self, minibatch):
+a = []
+for ex in minibatch:
+b = []
+# print(ex)
+for sentence in ex:
+b.append(" ".join(sentence))
+a.append(b)
+return a
+
+# string to list
+
+def tokenize_story(self, minibatch):
+a = []
+for ex in minibatch:
+b = []
+# print(ex)
+for sentence in ex:
+b.append(self.tokenize(sentence))
+a.append(b)
+return a
+

class Lang(object):
"""
@@ -395,3 +475,5 @@ def add_word(self, word):

else: # this word has been seen before, simply update its occurrence
self.word2count[word] += 1


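Among the methods added above, `embed_sentence_one_hot` carries the main logic: each word becomes a row of length `len(self.dictionaries)` with a single 1 at the word's index, and pad tokens stay all-zero. A standalone sketch of that behavior under a toy five-word vocabulary (an assumption; the class uses the dictionaries it builds from the data):

```python
import torch

def embed_sentence_one_hot(sentence, dictionaries, pad_token='<pad>'):
    # One row per word, one column per vocabulary entry.
    size_hot = len(dictionaries)
    words = sentence.split(" ")
    out = torch.zeros((len(words), size_hot))
    for i, word in enumerate(words):
        if word.lower() != pad_token:  # pad rows stay all-zero
            out[i, dictionaries[word.lower()]] = 1
    return out

vocab = {'mary': 0, 'went': 1, 'to': 2, 'the': 3, 'garden': 4}
emb = embed_sentence_one_hot("Mary went to the garden", vocab)
print(emb.shape)  # torch.Size([5, 5]): [sentence_length, vocab_size]
```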