Commit a51c903

Merge 845255b into 2ff3d0c
2 parents: 2ff3d0c + 845255b
lukehsiao committed Jul 17, 2018

Showing 15 changed files with 226 additions and 218 deletions.
4 changes: 2 additions & 2 deletions fonduer/__init__.py

@@ -22,7 +22,7 @@
 from fonduer.learning import GenerativeModel, SparseLogisticRegression
 from fonduer.meta import Meta
 from fonduer.parser import OmniParser
-from fonduer.parser.models import Document, Figure, Phrase
+from fonduer.parser.models import Document, Figure, Sentence
 from fonduer.parser.preprocessors import HTMLDocPreprocessor
 from fonduer.supervision.annotations import load_gold_labels
 from fonduer.supervision.async_annotations import (

@@ -56,9 +56,9 @@
     "OrganizationMatcher",
     "PDFPreprocessor",
     "PersonMatcher",
-    "Phrase",
     "RegexMatchEach",
     "RegexMatchSpan",
+    "Sentence",
     "SparseLogisticRegression",
     "Union",
     "__version__",
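For downstream users, the visible effect of this file's change is the top-level export. A minimal before/after sketch (the import paths come from the diff; everything else is illustrative):

# Before this commit:
#     from fonduer import Phrase
# After this commit, the same context type is exported as Sentence:
from fonduer import Sentence

# Code that type-checks contexts follows the rename, e.g.:
# isinstance(ctx, Sentence)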
10 changes: 5 additions & 5 deletions fonduer/candidates/candidates.py

@@ -180,7 +180,7 @@ def __init__(
     def apply(self, context, clear, split, **kwargs):
         """Extract candidates from the given Context.

-        Here, we define a context as a Phrase.
+        Here, we define a context as a Sentence.
        :param context:
        :param clear:
        :param split: Which split to use.

@@ -247,7 +247,7 @@ class OmniNgrams(Ngrams):
    Defines the space of candidates.

    Defines the space of candidates as all n-grams (n <= n_max) in a Document _x_,
-    divided into Phrases inside of html elements (such as table cells).
+    divided into Sentences inside of html elements (such as table cells).
    """

    def __init__(self, n_max=5, split_tokens=["-", "/"]):

@@ -258,16 +258,16 @@ def __init__(self, n_max=5, split_tokens=["-", "/"]):

    def apply(self, session, context):
        """
-        Generate OmniNgrams from a Document by parsing all of its Phrases.
+        Generate OmniNgrams from a Document by parsing all of its Sentences.
        """
        if not isinstance(context, Document):
            raise TypeError(
                "Input Contexts to OmniNgrams.apply() must be of type Document"
            )

        doc = session.query(Document).filter(Document.id == context.id).one()
-        for phrase in doc.phrases:
-            for ts in Ngrams.apply(self, phrase):
+        for sentence in doc.sentences:
+            for ts in Ngrams.apply(self, sentence):
                yield ts
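A hedged usage sketch of the renamed traversal in OmniNgrams.apply(); it assumes a database already populated by the parser, and the connection string is a placeholder:

from fonduer.candidates.candidates import OmniNgrams
from fonduer.meta import Meta
from fonduer.parser.models import Document

# Placeholder connection string; adjust for your setup.
session = Meta.init("postgres://localhost:5432/fonduer_db").Session()

ngrams = OmniNgrams(n_max=3)
doc = session.query(Document).first()

# apply() now walks doc.sentences (formerly doc.phrases) and yields
# n-gram spans from each Sentence.
for span in ngrams.apply(session, doc):
    print(span)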
2 changes: 1 addition & 1 deletion fonduer/candidates/matchers.py

@@ -123,7 +123,7 @@ def init(self):
                 w.lower() if self.ignore_case else w for w in self.opts["d"]
             )
         except KeyError:
-            raise Exception("Please supply a dictionary (list of phrases) d as d=d.")
+            raise Exception("Please supply a dictionary (list of sentences) d as d=d.")

         # Optionally use a stemmer, preprocess the dictionary
         # Note that user can provide *an object having a stem() method*
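The re-worded error message above guards the d= option of the dictionary-based matcher. A small sketch of the call it protects (the matcher name DictionaryMatch is assumed from the Snorkel-lineage API; the word list is illustrative):

from fonduer.candidates.matchers import DictionaryMatch

# The matcher reads its word list from the d= keyword; omitting it
# triggers the KeyError branch and the exception shown in the diff.
matcher = DictionaryMatch(d=["capacitor", "resistor"], ignore_case=True)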
12 changes: 6 additions & 6 deletions fonduer/features/table_features.py

@@ -69,22 +69,22 @@ def tablelib_unary_features(span):
     """
     if not span.sentence.is_tabular():
         return
-    phrase = span.sentence
+    sentence = span.sentence
     for attrib in settings.featurization.table.unary_features.attrib:
         for ngram in get_cell_ngrams(
             span,
             n_max=settings.featurization.table.unary_features.get_cell_ngrams.max,
             attrib=attrib,
         ):
             yield "CELL_%s_[%s]" % (attrib.upper(), ngram), DEF_VALUE
-    for row_num in range(phrase.row_start, phrase.row_end + 1):
+    for row_num in range(sentence.row_start, sentence.row_end + 1):
         yield "ROW_NUM_[%s]" % row_num, DEF_VALUE
-    for col_num in range(phrase.col_start, phrase.col_end + 1):
+    for col_num in range(sentence.col_start, sentence.col_end + 1):
         yield "COL_NUM_[%s]" % col_num, DEF_VALUE
     # NOTE: These two features could be accounted for by HTML_ATTR in
     # structural features
-    yield "ROW_SPAN_[%d]" % num_rows(phrase), DEF_VALUE
-    yield "COL_SPAN_[%d]" % num_cols(phrase), DEF_VALUE
+    yield "ROW_SPAN_[%d]" % num_rows(sentence), DEF_VALUE
+    yield "COL_SPAN_[%d]" % num_cols(sentence), DEF_VALUE
     for axis in ["row", "col"]:
         for ngram in get_head_ngrams(
             span,

@@ -148,7 +148,7 @@ def tablelib_binary_features(span1, span2):
                 span1.char_start - span2.char_start
             ), DEF_VALUE
         if span1.sentence == span2.sentence:
-            yield u"SAME_PHRASE", DEF_VALUE
+            yield u"SAME_SENTENCE", DEF_VALUE
         else:
             if span1.sentence.cell is not None and span2.sentence.cell is not None:
                 yield u"DIFF_TABLE", DEF_VALUE
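To show only the renamed feature keys (ROW_NUM, COL_NUM, SAME_SENTENCE), here is a toy, self-contained sketch that mimics the key format; the real generators take Fonduer spans backed by a database, and the value of DEF_VALUE here is assumed:

DEF_VALUE = 1  # assumed; the real constant lives in fonduer's feature code

def toy_unary_keys(row_start, row_end, col_start, col_end):
    # Mirrors the renamed ROW_NUM/COL_NUM key format from the diff.
    for row_num in range(row_start, row_end + 1):
        yield "ROW_NUM_[%s]" % row_num, DEF_VALUE
    for col_num in range(col_start, col_end + 1):
        yield "COL_NUM_[%s]" % col_num, DEF_VALUE

print(list(toy_unary_keys(0, 1, 2, 2)))
# [('ROW_NUM_[0]', 1), ('ROW_NUM_[1]', 1), ('COL_NUM_[2]', 1)]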
8 changes: 4 additions & 4 deletions fonduer/parser/models/__init__.py

@@ -1,20 +1,20 @@
 from fonduer.parser.models.context import Context, construct_stable_id, split_stable_id
 from fonduer.parser.models.document import Document
 from fonduer.parser.models.figure import Figure
-from fonduer.parser.models.phrase import Phrase
+from fonduer.parser.models.paragraph import Paragraph
+from fonduer.parser.models.sentence import Sentence
+from fonduer.parser.models.section import Section
 from fonduer.parser.models.table import Cell, Table
 from fonduer.parser.models.webpage import Webpage
-from fonduer.parser.models.section import Section
-from fonduer.parser.models.paragraph import Paragraph

 __all__ = [
     "Cell",
     "Context",
     "Document",
     "Figure",
     "Paragraph",
-    "Phrase",
     "Section",
+    "Sentence",
     "Table",
     "Webpage",
     "construct_stable_id",
fonduer/parser/models/{phrase.py → sentence.py}

@@ -11,8 +11,8 @@
 STR_ARRAY_TYPE = postgresql.ARRAY(String)


-class PhraseMixin(object):
-    """A phrase Context in a Document."""
+class SentenceMixin(object):
+    """A sentence Context in a Document."""

     def is_lingual(self):
         return False

@@ -27,8 +27,8 @@ def is_structural(self):
         return False

     def __repr__(self):
-        return "Phrase (Doc: {}, Index: {}, Text: {})".format(
-            self.document.name, self.phrase_idx, self.text
+        return "Sentence (Doc: {}, Index: {}, Text: {})".format(
+            self.document.name, self.sentence_idx, self.text
         )


@@ -45,8 +45,8 @@ def is_lingual(self):
         return self.lemmas is not None

     def __repr__(self):
-        return "LingualPhrase (Doc: {}, Index: {}, Text: {})".format(
-            self.document.name, self.phrase_idx, self.text
+        return "LingualSentence (Doc: {}, Index: {}, Text: {})".format(
+            self.document.name, self.sentence_idx, self.text
         )


@@ -61,7 +61,7 @@ def table_id(cls):
     def table(cls):
         return relationship(
             "Table",
-            backref=backref("phrases", cascade="all, delete-orphan"),
+            backref=backref("sentences", cascade="all, delete-orphan"),
             foreign_keys=lambda: cls.table_id,
         )

@@ -73,7 +73,7 @@ def cell_id(cls):
     def cell(cls):
         return relationship(
             "Cell",
-            backref=backref("phrases", cascade="all, delete-orphan"),
+            backref=backref("sentences", cascade="all, delete-orphan"),
             foreign_keys=lambda: cls.cell_id,
         )

@@ -100,12 +100,12 @@ def __repr__(self):
             if self.col_start != self.col_end
             else self.col_start
         )
-        return "TabularPhrase (Doc: {}, Table: {}, Row: {}, Col: {}, Index: {}, Text: {})".format(
+        return "TabularSentence (Doc: {}, Table: {}, Row: {}, Col: {}, Index: {}, Text: {})".format(
             self.document.name,
             (lambda: self.table).position,
             rows,
             cols,
-            self.phrase_idx,
+            self.sentence_idx,
             self.text,
         )

@@ -123,7 +123,7 @@ def is_visual(self):
         return self.page is not None and self.page[0] is not None

     def __repr__(self):
-        return "VisualPhrase (Doc: {}, Page: {}, (T,B,L,R): ({},{},{},{}), Text: {})".format(
+        return "VisualSentence (Doc: {}, Page: {}, (T,B,L,R): ({},{},{},{}), Text: {})".format(
             self.document.name,
             self.page,
             self.top,

@@ -145,37 +145,39 @@ def is_structural(self):
         return self.html_tag is not None

     def __repr__(self):
-        return "StructuralPhrase (Doc: {}, Tag: {}, Text: {})".format(
+        return "StructuralSentence (Doc: {}, Tag: {}, Text: {})".format(
             self.document.name, self.html_tag, self.text
         )


-# PhraseMixin must come last in arguments to not ovewrite is_* methods
-# class Phrase(Context, StructuralMixin, PhraseMixin): # Memex variant
-class Phrase(
-    Context, TabularMixin, LingualMixin, VisualMixin, StructuralMixin, PhraseMixin
+# SentenceMixin must come last in arguments to not ovewrite is_* methods
+# class Sentence(Context, StructuralMixin, SentenceMixin): # Memex variant
+class Sentence(
+    Context, TabularMixin, LingualMixin, VisualMixin, StructuralMixin, SentenceMixin
 ):
-    """A Phrase subclass with Lingual, Tabular, Visual, and HTML attributes."""
+    """A Sentence subclass with Lingual, Tabular, Visual, and HTML attributes."""

-    __tablename__ = "phrase"
+    __tablename__ = "sentence"
     id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True)
     document_id = Column(Integer, ForeignKey("document.id"))
     document = relationship(
         "Document",
-        backref=backref("phrases", cascade="all, delete-orphan"),
+        backref=backref("sentences", cascade="all, delete-orphan"),
         foreign_keys=document_id,
     )
-    phrase_num = Column(Integer, nullable=False)  # unique Phrase number per document
+    sentence_num = Column(
+        Integer, nullable=False
+    )  # unique sentence number per document
     text = Column(Text, nullable=False)
     words = Column(STR_ARRAY_TYPE)
     char_offsets = Column(INT_ARRAY_TYPE)
     entity_cids = Column(STR_ARRAY_TYPE)
     entity_types = Column(STR_ARRAY_TYPE)
     abs_char_offsets = Column(INT_ARRAY_TYPE)

-    __mapper_args__ = {"polymorphic_identity": "phrase"}
+    __mapper_args__ = {"polymorphic_identity": "sentence"}

-    __table_args__ = (UniqueConstraint(document_id, phrase_num),)
+    __table_args__ = (UniqueConstraint(document_id, sentence_num),)

     def __repr__(self):
         if self.is_tabular():

@@ -189,25 +191,25 @@ def __repr__(self):
                 if self.col_start != self.col_end
                 else self.col_start
             )
-            return "Phrase (Doc: '{}', Table: {}, Row: {}, Col: {}, Index: {}, Text: '{}')".format(
+            return "Sentence (Doc: '{}', Table: {}, Row: {}, Col: {}, Index: {}, Text: '{}')".format(
                 self.document.name,
                 self.table.position,
                 rows,
                 cols,
-                self.phrase_num,
+                self.sentence_num,
                 self.text,
             )
         else:
-            return "Phrase (Doc: '{}', Index: {}, Text: '{}')".format(
-                self.document.name, self.phrase_num, self.text
+            return "Sentence (Doc: '{}', Index: {}, Text: '{}')".format(
+                self.document.name, self.sentence_num, self.text
             )

     def _asdict(self):
         return {
             # base
             "id": self.id,
             # 'document': self.document,
-            "phrase_num": self.phrase_num,
+            "sentence_num": self.sentence_num,
             "text": self.text,
             "entity_cids": self.entity_cids,
             "entity_types": self.entity_types,
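The renamed ORM surface (the "sentence" table, the Document.sentences backref, and the sentence_num column) can be exercised as in this sketch; it assumes a populated database, and the connection string is a placeholder:

from fonduer.meta import Meta
from fonduer.parser.models import Document, Sentence

session = Meta.init("postgres://localhost:5432/fonduer_db").Session()

# Document.sentences replaces Document.phrases; rows now live in the
# "sentence" table with a per-document sentence_num.
doc = session.query(Document).first()
for sentence in doc.sentences:
    if sentence.is_tabular():
        print(sentence.sentence_num, sentence.text)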
30 changes: 16 additions & 14 deletions fonduer/parser/parser.py

@@ -14,7 +14,7 @@
     Context,
     Document,
     Figure,
-    Phrase,
+    Sentence,
     Table,
     construct_stable_id,
     split_stable_id,

@@ -144,7 +144,7 @@ def apply(self, x, **kwargs):
             if missing_pdf:
                 logger.error("Visual parsing failed: pdf files are required")
             yield from self.vizlink.parse_visual(
-                document.name, document.phrases, self.pdf_path
+                document.name, document.sentences, self.pdf_path
             )
         else:
             yield from self.parse_structure(document, text)

@@ -190,8 +190,8 @@ def parse_structure(self, document, text):
         self.parsed = 0
         self.parent_idx = 0
         self.position = 0
-        self.phrase_num = 0
-        self.abs_phrase_offset = 0
+        self.sentence_num = 0
+        self.abs_sentence_offset = 0

         def parse_node(node, table_info=None, figure_info=None):
             if node.tag is etree.Comment:

@@ -224,19 +224,19 @@ def parse_node(node, table_info=None, figure_info=None):
                 (_, _, _, char_end) = split_stable_id(parts["stable_id"])
                 try:
                     parts["document"] = document
-                    parts["phrase_num"] = self.phrase_num
-                    abs_phrase_offset_end = (
-                        self.abs_phrase_offset
+                    parts["sentence_num"] = self.sentence_num
+                    abs_sentence_offset_end = (
+                        self.abs_sentence_offset
                         + parts["char_offsets"][-1]
                         + len(parts["words"][-1])
                     )
                     parts["stable_id"] = construct_stable_id(
                         document,
-                        "phrase",
-                        self.abs_phrase_offset,
-                        abs_phrase_offset_end,
+                        "sentence",
+                        self.abs_sentence_offset,
+                        abs_sentence_offset_end,
                     )
-                    self.abs_phrase_offset = abs_phrase_offset_end
+                    self.abs_sentence_offset = abs_sentence_offset_end
                     if self.structural:
                         context_node = (
                             node.getparent() if field == "tail" else node

@@ -300,9 +300,9 @@ def parse_node(node, table_info=None, figure_info=None):
                     parts = table_info.apply_tabular(
                         parts, parent, self.position
                     )
-                    yield Phrase(**parts)
+                    yield Sentence(**parts)
                     self.position += 1
-                    self.phrase_num += 1
+                    self.sentence_num += 1
                 except Exception as e:
                     # This should never happen
                     logger.exception(str(e))

@@ -431,7 +431,9 @@ def apply_tabular(self, parts, parent, position):
             parts["col_start"] = parent.col_start
             parts["col_end"] = parent.col_end
         else:
-            raise NotImplementedError("Phrase parent must be Document, Table, or Cell")
+            raise NotImplementedError(
+                "Sentence parent must be Document, Table, or Cell"
+            )
         return parts
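End to end, the parser now materializes Sentence rows. A sketch of the typical pipeline at this commit, with the document path and connection string as placeholders:

from fonduer.meta import Meta
from fonduer.parser import OmniParser
from fonduer.parser.models import Sentence
from fonduer.parser.preprocessors import HTMLDocPreprocessor

session = Meta.init("postgres://localhost:5432/fonduer_db").Session()
doc_preprocessor = HTMLDocPreprocessor("data/html/", max_docs=1)

# parse_structure() now emits Sentence(**parts) with "sentence" stable ids.
corpus_parser = OmniParser(structural=True, lingual=True)
corpus_parser.apply(doc_preprocessor)

print(session.query(Sentence).count())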
2 changes: 1 addition & 1 deletion fonduer/parser/simple_tokenizer.py

@@ -28,7 +28,7 @@ def parse(self, document, contents):
                 int(_) for _ in np.cumsum([len(x) + 1 for x in words])[:-1]
             ]
             text = " ".join(words)
-            stable_id = construct_stable_id(document, "phrase", i, i)
+            stable_id = construct_stable_id(document, "sentence", i, i)
             yield {
                 "text": text,
                 "words": words,
