Commit a51c903

Merge 845255b into 2ff3d0c
2 parents: 2ff3d0c + 845255b
lukehsiao committed Jul 17, 2018

Showing 15 changed files with 226 additions and 218 deletions.
4 changes: 2 additions & 2 deletions fonduer/__init__.py

@@ -22,7 +22,7 @@
 from fonduer.learning import GenerativeModel, SparseLogisticRegression
 from fonduer.meta import Meta
 from fonduer.parser import OmniParser
-from fonduer.parser.models import Document, Figure, Phrase
+from fonduer.parser.models import Document, Figure, Sentence
 from fonduer.parser.preprocessors import HTMLDocPreprocessor
 from fonduer.supervision.annotations import load_gold_labels
 from fonduer.supervision.async_annotations import (

@@ -56,9 +56,9 @@
     "OrganizationMatcher",
     "PDFPreprocessor",
     "PersonMatcher",
-    "Phrase",
     "RegexMatchEach",
     "RegexMatchSpan",
+    "Sentence",
     "SparseLogisticRegression",
     "Union",
     "__version__",
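For downstream users, the visible effect of this file's change is the top-level export. A minimal before/after sketch (the import paths come from the diff; everything else is illustrative):

# Before this commit:
#     from fonduer import Phrase
# After this commit, the same context type is exported as Sentence:
from fonduer import Sentence

# Code that type-checks contexts follows the rename, e.g.:
# isinstance(ctx, Sentence)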
10 changes: 5 additions & 5 deletions fonduer/candidates/candidates.py

@@ -180,7 +180,7 @@ def __init__(
     def apply(self, context, clear, split, **kwargs):
         """Extract candidates from the given Context.

-        Here, we define a context as a Phrase.
+        Here, we define a context as a Sentence.
        :param context:
        :param clear:
        :param split: Which split to use.

@@ -247,7 +247,7 @@ class OmniNgrams(Ngrams):
    Defines the space of candidates.

    Defines the space of candidates as all n-grams (n <= n_max) in a Document _x_,
-    divided into Phrases inside of html elements (such as table cells).
+    divided into Sentences inside of html elements (such as table cells).
    """

    def __init__(self, n_max=5, split_tokens=["-", "/"]):

@@ -258,16 +258,16 @@ def __init__(self, n_max=5, split_tokens=["-", "/"]):

    def apply(self, session, context):
        """
-        Generate OmniNgrams from a Document by parsing all of its Phrases.
+        Generate OmniNgrams from a Document by parsing all of its Sentences.
        """
        if not isinstance(context, Document):
            raise TypeError(
                "Input Contexts to OmniNgrams.apply() must be of type Document"
            )

        doc = session.query(Document).filter(Document.id == context.id).one()
-        for phrase in doc.phrases:
-            for ts in Ngrams.apply(self, phrase):
+        for sentence in doc.sentences:
+            for ts in Ngrams.apply(self, sentence):
                yield ts
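A hedged usage sketch of the renamed traversal in OmniNgrams.apply(); it assumes a database already populated by the parser, and the connection string is a placeholder:

from fonduer.candidates.candidates import OmniNgrams
from fonduer.meta import Meta
from fonduer.parser.models import Document

# Placeholder connection string; adjust for your setup.
session = Meta.init("postgres://localhost:5432/fonduer_db").Session()

ngrams = OmniNgrams(n_max=3)
doc = session.query(Document).first()

# apply() now walks doc.sentences (formerly doc.phrases) and yields
# n-gram spans from each Sentence.
for span in ngrams.apply(session, doc):
    print(span)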
2 changes: 1 addition & 1 deletion fonduer/candidates/matchers.py

@@ -123,7 +123,7 @@ def init(self):
                 w.lower() if self.ignore_case else w for w in self.opts["d"]
             )
         except KeyError:
-            raise Exception("Please supply a dictionary (list of phrases) d as d=d.")
+            raise Exception("Please supply a dictionary (list of sentences) d as d=d.")

         # Optionally use a stemmer, preprocess the dictionary
         # Note that user can provide *an object having a stem() method*
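The re-worded error message above guards the d= option of the dictionary-based matcher. A small sketch of the call it protects (the matcher name DictionaryMatch is assumed from the Snorkel-lineage API; the word list is illustrative):

from fonduer.candidates.matchers import DictionaryMatch

# The matcher reads its word list from the d= keyword; omitting it
# triggers the KeyError branch and the exception shown in the diff.
matcher = DictionaryMatch(d=["capacitor", "resistor"], ignore_case=True)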
12 changes: 6 additions & 6 deletions fonduer/features/table_features.py

@@ -69,22 +69,22 @@ def tablelib_unary_features(span):
     """
     if not span.sentence.is_tabular():
         return
-    phrase = span.sentence
+    sentence = span.sentence
     for attrib in settings.featurization.table.unary_features.attrib:
         for ngram in get_cell_ngrams(
             span,
             n_max=settings.featurization.table.unary_features.get_cell_ngrams.max,
             attrib=attrib,
         ):
             yield "CELL_%s_[%s]" % (attrib.upper(), ngram), DEF_VALUE
-    for row_num in range(phrase.row_start, phrase.row_end + 1):
+    for row_num in range(sentence.row_start, sentence.row_end + 1):
         yield "ROW_NUM_[%s]" % row_num, DEF_VALUE
-    for col_num in range(phrase.col_start, phrase.col_end + 1):
+    for col_num in range(sentence.col_start, sentence.col_end + 1):
         yield "COL_NUM_[%s]" % col_num, DEF_VALUE
     # NOTE: These two features could be accounted for by HTML_ATTR in
     # structural features
-    yield "ROW_SPAN_[%d]" % num_rows(phrase), DEF_VALUE
-    yield "COL_SPAN_[%d]" % num_cols(phrase), DEF_VALUE
+    yield "ROW_SPAN_[%d]" % num_rows(sentence), DEF_VALUE
+    yield "COL_SPAN_[%d]" % num_cols(sentence), DEF_VALUE
     for axis in ["row", "col"]:
         for ngram in get_head_ngrams(
             span,

@@ -148,7 +148,7 @@ def tablelib_binary_features(span1, span2):
                 span1.char_start - span2.char_start
             ), DEF_VALUE
         if span1.sentence == span2.sentence:
-            yield u"SAME_PHRASE", DEF_VALUE
+            yield u"SAME_SENTENCE", DEF_VALUE
         else:
             if span1.sentence.cell is not None and span2.sentence.cell is not None:
                 yield u"DIFF_TABLE", DEF_VALUE
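To show only the renamed feature keys (ROW_NUM, COL_NUM, SAME_SENTENCE), here is a toy, self-contained sketch that mimics the key format; the real generators take Fonduer spans backed by a database, and the value of DEF_VALUE here is assumed:

DEF_VALUE = 1  # assumed; the real constant lives in fonduer's feature code

def toy_unary_keys(row_start, row_end, col_start, col_end):
    # Mirrors the renamed ROW_NUM/COL_NUM key format from the diff.
    for row_num in range(row_start, row_end + 1):
        yield "ROW_NUM_[%s]" % row_num, DEF_VALUE
    for col_num in range(col_start, col_end + 1):
        yield "COL_NUM_[%s]" % col_num, DEF_VALUE

print(list(toy_unary_keys(0, 1, 2, 2)))
# [('ROW_NUM_[0]', 1), ('ROW_NUM_[1]', 1), ('COL_NUM_[2]', 1)]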
8 changes: 4 additions & 4 deletions fonduer/parser/models/__init__.py

@@ -1,20 +1,20 @@
 from fonduer.parser.models.context import Context, construct_stable_id, split_stable_id
 from fonduer.parser.models.document import Document
 from fonduer.parser.models.figure import Figure
-from fonduer.parser.models.phrase import Phrase
+from fonduer.parser.models.paragraph import Paragraph
+from fonduer.parser.models.sentence import Sentence
+from fonduer.parser.models.section import Section
 from fonduer.parser.models.table import Cell, Table
 from fonduer.parser.models.webpage import Webpage
-from fonduer.parser.models.section import Section
-from fonduer.parser.models.paragraph import Paragraph

 __all__ = [
     "Cell",
     "Context",
     "Document",
     "Figure",
     "Paragraph",
-    "Phrase",
     "Section",
+    "Sentence",
     "Table",
     "Webpage",
     "construct_stable_id",
fonduer/parser/models/{phrase.py → sentence.py}

@@ -11,8 +11,8 @@
 STR_ARRAY_TYPE = postgresql.ARRAY(String)


-class PhraseMixin(object):
-    """A phrase Context in a Document."""
+class SentenceMixin(object):
+    """A sentence Context in a Document."""

     def is_lingual(self):
         return False

@@ -27,8 +27,8 @@ def is_structural(self):
         return False

     def __repr__(self):
-        return "Phrase (Doc: {}, Index: {}, Text: {})".format(
-            self.document.name, self.phrase_idx, self.text
+        return "Sentence (Doc: {}, Index: {}, Text: {})".format(
+            self.document.name, self.sentence_idx, self.text
         )


@@ -45,8 +45,8 @@ def is_lingual(self):
         return self.lemmas is not None

     def __repr__(self):
-        return "LingualPhrase (Doc: {}, Index: {}, Text: {})".format(
-            self.document.name, self.phrase_idx, self.text
+        return "LingualSentence (Doc: {}, Index: {}, Text: {})".format(
+            self.document.name, self.sentence_idx, self.text
         )


@@ -61,7 +61,7 @@ def table_id(cls):
     def table(cls):
         return relationship(
             "Table",
-            backref=backref("phrases", cascade="all, delete-orphan"),
+            backref=backref("sentences", cascade="all, delete-orphan"),
             foreign_keys=lambda: cls.table_id,
         )

@@ -73,7 +73,7 @@ def cell_id(cls):
     def cell(cls):
         return relationship(
             "Cell",
-            backref=backref("phrases", cascade="all, delete-orphan"),
+            backref=backref("sentences", cascade="all, delete-orphan"),
             foreign_keys=lambda: cls.cell_id,
         )

@@ -100,12 +100,12 @@ def __repr__(self):
             if self.col_start != self.col_end
             else self.col_start
         )
-        return "TabularPhrase (Doc: {}, Table: {}, Row: {}, Col: {}, Index: {}, Text: {})".format(
+        return "TabularSentence (Doc: {}, Table: {}, Row: {}, Col: {}, Index: {}, Text: {})".format(
             self.document.name,
             (lambda: self.table).position,
             rows,
             cols,
-            self.phrase_idx,
+            self.sentence_idx,
             self.text,
         )

@@ -123,7 +123,7 @@ def is_visual(self):
         return self.page is not None and self.page[0] is not None

     def __repr__(self):
-        return "VisualPhrase (Doc: {}, Page: {}, (T,B,L,R): ({},{},{},{}), Text: {})".format(
+        return "VisualSentence (Doc: {}, Page: {}, (T,B,L,R): ({},{},{},{}), Text: {})".format(
             self.document.name,
             self.page,
             self.top,

@@ -145,37 +145,39 @@ def is_structural(self):
         return self.html_tag is not None

     def __repr__(self):
-        return "StructuralPhrase (Doc: {}, Tag: {}, Text: {})".format(
+        return "StructuralSentence (Doc: {}, Tag: {}, Text: {})".format(
             self.document.name, self.html_tag, self.text
         )


-# PhraseMixin must come last in arguments to not ovewrite is_* methods
-# class Phrase(Context, StructuralMixin, PhraseMixin): # Memex variant
-class Phrase(
-    Context, TabularMixin, LingualMixin, VisualMixin, StructuralMixin, PhraseMixin
+# SentenceMixin must come last in arguments to not ovewrite is_* methods
+# class Sentence(Context, StructuralMixin, SentenceMixin): # Memex variant
+class Sentence(
+    Context, TabularMixin, LingualMixin, VisualMixin, StructuralMixin, SentenceMixin
 ):
-    """A Phrase subclass with Lingual, Tabular, Visual, and HTML attributes."""
+    """A Sentence subclass with Lingual, Tabular, Visual, and HTML attributes."""

-    __tablename__ = "phrase"
+    __tablename__ = "sentence"
     id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True)
     document_id = Column(Integer, ForeignKey("document.id"))
     document = relationship(
         "Document",
-        backref=backref("phrases", cascade="all, delete-orphan"),
+        backref=backref("sentences", cascade="all, delete-orphan"),
         foreign_keys=document_id,
     )
-    phrase_num = Column(Integer, nullable=False)  # unique Phrase number per document
+    sentence_num = Column(
+        Integer, nullable=False
+    )  # unique sentence number per document
     text = Column(Text, nullable=False)
     words = Column(STR_ARRAY_TYPE)
     char_offsets = Column(INT_ARRAY_TYPE)
     entity_cids = Column(STR_ARRAY_TYPE)
     entity_types = Column(STR_ARRAY_TYPE)
     abs_char_offsets = Column(INT_ARRAY_TYPE)

-    __mapper_args__ = {"polymorphic_identity": "phrase"}
+    __mapper_args__ = {"polymorphic_identity": "sentence"}

-    __table_args__ = (UniqueConstraint(document_id, phrase_num),)
+    __table_args__ = (UniqueConstraint(document_id, sentence_num),)

     def __repr__(self):
         if self.is_tabular():

@@ -189,25 +191,25 @@ def __repr__(self):
                 if self.col_start != self.col_end
                 else self.col_start
             )
-            return "Phrase (Doc: '{}', Table: {}, Row: {}, Col: {}, Index: {}, Text: '{}')".format(
+            return "Sentence (Doc: '{}', Table: {}, Row: {}, Col: {}, Index: {}, Text: '{}')".format(
                 self.document.name,
                 self.table.position,
                 rows,
                 cols,
-                self.phrase_num,
+                self.sentence_num,
                 self.text,
             )
         else:
-            return "Phrase (Doc: '{}', Index: {}, Text: '{}')".format(
-                self.document.name, self.phrase_num, self.text
+            return "Sentence (Doc: '{}', Index: {}, Text: '{}')".format(
+                self.document.name, self.sentence_num, self.text
             )

     def _asdict(self):
         return {
             # base
             "id": self.id,
             # 'document': self.document,
-            "phrase_num": self.phrase_num,
+            "sentence_num": self.sentence_num,
             "text": self.text,
             "entity_cids": self.entity_cids,
             "entity_types": self.entity_types,
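The renamed ORM surface (the "sentence" table, the Document.sentences backref, and the sentence_num column) can be exercised as in this sketch; it assumes a populated database, and the connection string is a placeholder:

from fonduer.meta import Meta
from fonduer.parser.models import Document, Sentence

session = Meta.init("postgres://localhost:5432/fonduer_db").Session()

# Document.sentences replaces Document.phrases; rows now live in the
# "sentence" table with a per-document sentence_num.
doc = session.query(Document).first()
for sentence in doc.sentences:
    if sentence.is_tabular():
        print(sentence.sentence_num, sentence.text)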
30 changes: 16 additions & 14 deletions fonduer/parser/parser.py

@@ -14,7 +14,7 @@
     Context,
     Document,
     Figure,
-    Phrase,
+    Sentence,
     Table,
     construct_stable_id,
     split_stable_id,

@@ -144,7 +144,7 @@ def apply(self, x, **kwargs):
             if missing_pdf:
                 logger.error("Visual parsing failed: pdf files are required")
             yield from self.vizlink.parse_visual(
-                document.name, document.phrases, self.pdf_path
+                document.name, document.sentences, self.pdf_path
             )
         else:
             yield from self.parse_structure(document, text)

@@ -190,8 +190,8 @@ def parse_structure(self, document, text):
         self.parsed = 0
         self.parent_idx = 0
         self.position = 0
-        self.phrase_num = 0
-        self.abs_phrase_offset = 0
+        self.sentence_num = 0
+        self.abs_sentence_offset = 0

         def parse_node(node, table_info=None, figure_info=None):
             if node.tag is etree.Comment:

@@ -224,19 +224,19 @@ def parse_node(node, table_info=None, figure_info=None):
                 (_, _, _, char_end) = split_stable_id(parts["stable_id"])
                 try:
                     parts["document"] = document
-                    parts["phrase_num"] = self.phrase_num
-                    abs_phrase_offset_end = (
-                        self.abs_phrase_offset
+                    parts["sentence_num"] = self.sentence_num
+                    abs_sentence_offset_end = (
+                        self.abs_sentence_offset
                         + parts["char_offsets"][-1]
                         + len(parts["words"][-1])
                     )
                     parts["stable_id"] = construct_stable_id(
                         document,
-                        "phrase",
-                        self.abs_phrase_offset,
-                        abs_phrase_offset_end,
+                        "sentence",
+                        self.abs_sentence_offset,
+                        abs_sentence_offset_end,
                     )
-                    self.abs_phrase_offset = abs_phrase_offset_end
+                    self.abs_sentence_offset = abs_sentence_offset_end
                     if self.structural:
                         context_node = (
                             node.getparent() if field == "tail" else node

@@ -300,9 +300,9 @@ def parse_node(node, table_info=None, figure_info=None):
                     parts = table_info.apply_tabular(
                         parts, parent, self.position
                     )
-                    yield Phrase(**parts)
+                    yield Sentence(**parts)
                     self.position += 1
-                    self.phrase_num += 1
+                    self.sentence_num += 1
                 except Exception as e:
                     # This should never happen
                     logger.exception(str(e))

@@ -431,7 +431,9 @@ def apply_tabular(self, parts, parent, position):
             parts["col_start"] = parent.col_start
             parts["col_end"] = parent.col_end
         else:
-            raise NotImplementedError("Phrase parent must be Document, Table, or Cell")
+            raise NotImplementedError(
+                "Sentence parent must be Document, Table, or Cell"
+            )
         return parts
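End to end, the parser now materializes Sentence rows. A sketch of the typical pipeline at this commit, with the document path and connection string as placeholders:

from fonduer.meta import Meta
from fonduer.parser import OmniParser
from fonduer.parser.models import Sentence
from fonduer.parser.preprocessors import HTMLDocPreprocessor

session = Meta.init("postgres://localhost:5432/fonduer_db").Session()
doc_preprocessor = HTMLDocPreprocessor("data/html/", max_docs=1)

# parse_structure() now emits Sentence(**parts) with "sentence" stable ids.
corpus_parser = OmniParser(structural=True, lingual=True)
corpus_parser.apply(doc_preprocessor)

print(session.query(Sentence).count())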
2 changes: 1 addition & 1 deletion fonduer/parser/simple_tokenizer.py

@@ -28,7 +28,7 @@ def parse(self, document, contents):
                 int(_) for _ in np.cumsum([len(x) + 1 for x in words])[:-1]
             ]
             text = " ".join(words)
-            stable_id = construct_stable_id(document, "phrase", i, i)
+            stable_id = construct_stable_id(document, "sentence", i, i)
             yield {
                 "text": text,
                 "words": words,
