Skip to content

Commit

Permalink
Merge f4d28e9 into e3b01eb
Browse files Browse the repository at this point in the history
  • Loading branch information
senwu committed Nov 27, 2018
2 parents e3b01eb + f4d28e9 commit cae1475
Show file tree
Hide file tree
Showing 27 changed files with 1,126 additions and 204 deletions.
31 changes: 31 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,37 @@

Added
^^^^^
* `@senwu`_: Add support to extract multimodal candidates and
add ``DoNothingMatcher`` matcher.
(`#184 <https://github.com/HazyResearch/fonduer/pull/184>`_)

.. note::
Mention extraction now supports all data types in the data model. In Fonduer v0.3.6,
Mention extraction only supported ``MentionNgrams`` and ``MentionFigures``:

.. code:: python
from fonduer.candidates import (
MentionFigures,
MentionNgrams,
)
With this release, it supports all data types:

.. code:: python
from fonduer.candidates import (
MentionCaptions,
MentionCells,
MentionDocuments,
MentionFigures,
MentionNgrams,
MentionParagraphs,
MentionSections,
MentionSentences,
MentionTables,
)
* `@senwu`_: Add support to parse multiple sections in parser, fix webpage context, and
add name column for each context in data model.
(`#182 <https://github.com/HazyResearch/fonduer/pull/182>`_)
Expand Down
27 changes: 25 additions & 2 deletions src/fonduer/candidates/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,27 @@
from fonduer.candidates.candidates import CandidateExtractor
from fonduer.candidates.mentions import MentionExtractor, MentionFigures, MentionNgrams
from fonduer.candidates.mentions import (
MentionExtractor,
MentionFigures,
MentionNgrams,
MentionCaptions,
MentionCells,
MentionDocuments,
MentionParagraphs,
MentionSections,
MentionSentences,
MentionTables,
)

__all__ = ["CandidateExtractor", "MentionExtractor", "MentionFigures", "MentionNgrams"]
__all__ = [
"CandidateExtractor",
"MentionExtractor",
"MentionFigures",
"MentionNgrams",
"MentionCaptions",
"MentionCells",
"MentionDocuments",
"MentionParagraphs",
"MentionSections",
"MentionSentences",
"MentionTables",
]
6 changes: 6 additions & 0 deletions src/fonduer/candidates/matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,3 +499,9 @@ def init(self):
def _f(self, m):
    """The internal (non-composed) version of filter function f.

    Returns whatever ``self.func`` returns when called with ``m``.
    """
    return self.func(m)


class DoNothingMatcher(_Matcher):
    """A matcher that adds no behavior of its own on top of ``_Matcher``."""
217 changes: 211 additions & 6 deletions src/fonduer/candidates/mentions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@
from sqlalchemy.sql import select

from fonduer.candidates.models import Mention
from fonduer.candidates.models.image import TemporaryImage
from fonduer.candidates.models.span import TemporarySpan
from fonduer.candidates.models.caption_mention import TemporaryCaptionMention
from fonduer.candidates.models.cell_mention import TemporaryCellMention
from fonduer.candidates.models.document_mention import TemporaryDocumentMention
from fonduer.candidates.models.figure_mention import TemporaryFigureMention
from fonduer.candidates.models.paragraph_mention import TemporaryParagraphMention
from fonduer.candidates.models.section_mention import TemporarySectionMention
from fonduer.candidates.models.span_mention import TemporarySpanMention
from fonduer.candidates.models.table_mention import TemporaryTableMention
from fonduer.parser.models import Document
from fonduer.utils.udf import UDF, UDFRunner

Expand Down Expand Up @@ -69,7 +75,9 @@ def apply(self, context):
w = context.words[i + j - 1]
start = offsets[i]
end = offsets[i + j - 1] + len(w) - 1
ts = TemporarySpan(char_start=start, char_end=end, sentence=context)
ts = TemporarySpanMention(
char_start=start, char_end=end, sentence=context
)
if ts not in seen:
seen.add(ts)
yield ts
Expand All @@ -92,7 +100,7 @@ def apply(self, context):
for start_idx in start_idxs:
for end_idx in end_idxs:
if start_idx < end_idx:
ts = TemporarySpan(
ts = TemporarySpanMention(
char_start=start_idx,
char_end=end_idx - 1,
sentence=context,
Expand Down Expand Up @@ -147,7 +155,7 @@ class MentionFigures(MentionSpace):
"""Defines the space of Mentions as all figures in a Document *x*, indexing
by **position offset**.
:param types: If specified, only yield TemporaryImages whose url ends in
:param types: If specified, only yield TemporaryFigureMentions whose url ends in
one of the specified types. Example: types=["png", "jpg", "jpeg"].
:type types: list, tuple of str
"""
Expand Down Expand Up @@ -179,7 +187,204 @@ def apply(self, session, doc):
if self.types is None or any(
figure.url.lower().endswith(type) for type in self.types
):
yield TemporaryImage(figure)
yield TemporaryFigureMention(figure)


class MentionSentences(MentionSpace):
    """Mention space made up of every sentence of a Document.

    Each sentence is emitted as a single ``TemporarySpanMention`` that spans
    the sentence's full text.
    """

    def __init__(self):
        """Set up the underlying MentionSpace."""
        MentionSpace.__init__(self)

    def apply(self, session, doc):
        """Yield one whole-sentence ``TemporarySpanMention`` per Sentence of ``doc``.

        :param session: The database session
        :param doc: The ``Document`` to parse.
        :type doc: ``Document``.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionSentences.apply() must be of type Document"
            )

        # Re-fetch the document by id through the supplied session.
        doc_query = session.query(Document).filter(Document.id == doc.id)
        stored_doc = doc_query.one()
        for sent in stored_doc.sentences:
            # char_end is inclusive, hence the -1.
            last_offset = len(sent.text) - 1
            yield TemporarySpanMention(
                char_start=0, char_end=last_offset, sentence=sent
            )


class MentionParagraphs(MentionSpace):
    """Mention space made up of every paragraph of a Document.

    Each paragraph is wrapped in a ``TemporaryParagraphMention``.
    """

    def __init__(self):
        """Set up the underlying MentionSpace."""
        MentionSpace.__init__(self)

    def apply(self, session, doc):
        """Yield a ``TemporaryParagraphMention`` for each Paragraph of ``doc``.

        :param session: The database session
        :param doc: The ``Document`` to parse.
        :type doc: ``Document``.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionParagraphs.apply() must be of type Document"
            )

        # Re-fetch the document by id through the supplied session.
        doc_query = session.query(Document).filter(Document.id == doc.id)
        stored_doc = doc_query.one()
        for para in stored_doc.paragraphs:
            yield TemporaryParagraphMention(para)


class MentionCaptions(MentionSpace):
    """Mention space made up of every caption of a Document.

    Each caption is wrapped in a ``TemporaryCaptionMention``.
    """

    def __init__(self):
        """Set up the underlying MentionSpace."""
        MentionSpace.__init__(self)

    def apply(self, session, doc):
        """Yield a ``TemporaryCaptionMention`` for each Caption of ``doc``.

        :param session: The database session
        :param doc: The ``Document`` to parse.
        :type doc: ``Document``.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionCaptions.apply() must be of type Document"
            )

        # Re-fetch the document by id through the supplied session.
        doc_query = session.query(Document).filter(Document.id == doc.id)
        stored_doc = doc_query.one()
        for cap in stored_doc.captions:
            yield TemporaryCaptionMention(cap)


class MentionCells(MentionSpace):
    """Mention space made up of every cell of a Document.

    Each cell is wrapped in a ``TemporaryCellMention``.
    """

    def __init__(self):
        """Set up the underlying MentionSpace."""
        MentionSpace.__init__(self)

    def apply(self, session, doc):
        """Yield a ``TemporaryCellMention`` for each Cell of ``doc``.

        :param session: The database session
        :param doc: The ``Document`` to parse.
        :type doc: ``Document``.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionCells.apply() must be of type Document"
            )

        # Re-fetch the document by id through the supplied session.
        doc_query = session.query(Document).filter(Document.id == doc.id)
        stored_doc = doc_query.one()
        for table_cell in stored_doc.cells:
            yield TemporaryCellMention(table_cell)


class MentionTables(MentionSpace):
    """Mention space made up of every table of a Document.

    Each table is wrapped in a ``TemporaryTableMention``.
    """

    def __init__(self):
        """Set up the underlying MentionSpace."""
        MentionSpace.__init__(self)

    def apply(self, session, doc):
        """Yield a ``TemporaryTableMention`` for each Table of ``doc``.

        :param session: The database session
        :param doc: The ``Document`` to parse.
        :type doc: ``Document``.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionTables.apply() must be of type Document"
            )

        # Re-fetch the document by id through the supplied session.
        doc_query = session.query(Document).filter(Document.id == doc.id)
        stored_doc = doc_query.one()
        for tbl in stored_doc.tables:
            yield TemporaryTableMention(tbl)


class MentionSections(MentionSpace):
    """Mention space made up of every section of a Document.

    Each section is wrapped in a ``TemporarySectionMention``.
    """

    def __init__(self):
        """Set up the underlying MentionSpace."""
        MentionSpace.__init__(self)

    def apply(self, session, doc):
        """Yield a ``TemporarySectionMention`` for each Section of ``doc``.

        :param session: The database session
        :param doc: The ``Document`` to parse.
        :type doc: ``Document``.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionSections.apply() must be of type Document"
            )

        # Re-fetch the document by id through the supplied session.
        doc_query = session.query(Document).filter(Document.id == doc.id)
        stored_doc = doc_query.one()
        for sec in stored_doc.sections:
            yield TemporarySectionMention(sec)


class MentionDocuments(MentionSpace):
    """Mention space consisting of the Document itself.

    Exactly one ``TemporaryDocumentMention`` is produced per Document.
    """

    def __init__(self):
        """Set up the underlying MentionSpace."""
        MentionSpace.__init__(self)

    def apply(self, session, doc):
        """Yield a single ``TemporaryDocumentMention`` wrapping ``doc``.

        :param session: The database session
        :param doc: The ``Document`` to parse.
        :type doc: ``Document``.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionDocuments.apply() must be of type Document"
            )

        # Re-fetch the document by id through the supplied session; the mention
        # wraps that re-fetched instance rather than the one passed in.
        doc_query = session.query(Document).filter(Document.id == doc.id)
        stored_doc = doc_query.one()
        yield TemporaryDocumentMention(stored_doc)


class MentionExtractor(UDFRunner):
Expand Down
24 changes: 18 additions & 6 deletions src/fonduer/candidates/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,27 @@
from fonduer.candidates.models.candidate import Candidate, candidate_subclass
from fonduer.candidates.models.image import Image
from fonduer.candidates.models.implicitspan import ImplicitSpan
from fonduer.candidates.models.figure_mention import FigureMention
from fonduer.candidates.models.implicit_span_mention import ImplicitSpanMention
from fonduer.candidates.models.mention import Mention, mention_subclass
from fonduer.candidates.models.span import Span
from fonduer.candidates.models.span_mention import SpanMention
from fonduer.candidates.models.caption_mention import CaptionMention
from fonduer.candidates.models.cell_mention import CellMention
from fonduer.candidates.models.document_mention import DocumentMention
from fonduer.candidates.models.paragraph_mention import ParagraphMention
from fonduer.candidates.models.section_mention import SectionMention
from fonduer.candidates.models.table_mention import TableMention

__all__ = [
"Candidate",
"Image",
"ImplicitSpan",
"FigureMention",
"ImplicitSpanMention",
"Mention",
"Span",
"SpanMention",
"candidate_subclass",
"mention_subclass",
"CaptionMention",
"CellMention",
"DocumentMention",
"ParagraphMention",
"SectionMention",
"TableMention",
]

0 comments on commit cae1475

Please sign in to comment.