Merge f4d28e9 into e3b01eb

HazyResearch · Nov 27, 2018 · cae1475 · cae1475
2 parents e3b01eb + f4d28e9
commit cae1475
Show file tree

Hide file tree

Showing 27 changed files with 1,126 additions and 204 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -3,6 +3,37 @@
 
 Added
 ^^^^^
+* `@senwu`_: Add support to extract multimodal candidates and
+  add ``DoNothingMatcher`` matcher.
+  (`#184 <https://github.com/HazyResearch/fonduer/pull/184>`_)
+
+.. note::
+    Mention extraction now supports all data types in the data model. In Fonduer
+    v0.3.6, Mention extraction only supported ``MentionNgrams`` and
+    ``MentionFigures``:
+
+    .. code:: python
+
+        from fonduer.candidates import (
+            MentionFigures,
+            MentionNgrams,
+        )
+
+    With this release, it supports all data types:
+
+    .. code:: python
+
+        from fonduer.candidates import (
+            MentionCaptions,
+            MentionCells,
+            MentionDocuments,
+            MentionFigures,
+            MentionNgrams,
+            MentionParagraphs,
+            MentionSections,
+            MentionSentences,
+            MentionTables,
+        )
+
 * `@senwu`_: Add support to parse multiple sections in parser, fix webpage context, and
   add name column for each context in data model.
   (`#182 <https://github.com/HazyResearch/fonduer/pull/182>`_)

diff --git a/src/fonduer/candidates/__init__.py b/src/fonduer/candidates/__init__.py
@@ -1,4 +1,27 @@
 from fonduer.candidates.candidates import CandidateExtractor
-from fonduer.candidates.mentions import MentionExtractor, MentionFigures, MentionNgrams
+from fonduer.candidates.mentions import (
+    MentionExtractor,
+    MentionFigures,
+    MentionNgrams,
+    MentionCaptions,
+    MentionCells,
+    MentionDocuments,
+    MentionParagraphs,
+    MentionSections,
+    MentionSentences,
+    MentionTables,
+)
 
-__all__ = ["CandidateExtractor", "MentionExtractor", "MentionFigures", "MentionNgrams"]
+__all__ = [
+    "CandidateExtractor",
+    "MentionExtractor",
+    "MentionFigures",
+    "MentionNgrams",
+    "MentionCaptions",
+    "MentionCells",
+    "MentionDocuments",
+    "MentionParagraphs",
+    "MentionSections",
+    "MentionSentences",
+    "MentionTables",
+]
diff --git a/src/fonduer/candidates/matchers.py b/src/fonduer/candidates/matchers.py
@@ -499,3 +499,9 @@ def init(self):
     def _f(self, m):
         """The internal (non-composed) version of filter function f"""
         return self.func(m)
+
+
+class DoNothingMatcher(_Matcher):
+    """Matcher class for doing nothing"""
+
+    pass
diff --git a/src/fonduer/candidates/mentions.py b/src/fonduer/candidates/mentions.py
@@ -8,8 +8,14 @@
 from sqlalchemy.sql import select
 
 from fonduer.candidates.models import Mention
-from fonduer.candidates.models.image import TemporaryImage
-from fonduer.candidates.models.span import TemporarySpan
+from fonduer.candidates.models.caption_mention import TemporaryCaptionMention
+from fonduer.candidates.models.cell_mention import TemporaryCellMention
+from fonduer.candidates.models.document_mention import TemporaryDocumentMention
+from fonduer.candidates.models.figure_mention import TemporaryFigureMention
+from fonduer.candidates.models.paragraph_mention import TemporaryParagraphMention
+from fonduer.candidates.models.section_mention import TemporarySectionMention
+from fonduer.candidates.models.span_mention import TemporarySpanMention
+from fonduer.candidates.models.table_mention import TemporaryTableMention
 from fonduer.parser.models import Document
 from fonduer.utils.udf import UDF, UDFRunner
 
@@ -69,7 +75,9 @@ def apply(self, context):
                 w = context.words[i + j - 1]
                 start = offsets[i]
                 end = offsets[i + j - 1] + len(w) - 1
-                ts = TemporarySpan(char_start=start, char_end=end, sentence=context)
+                ts = TemporarySpanMention(
+                    char_start=start, char_end=end, sentence=context
+                )
                 if ts not in seen:
                     seen.add(ts)
                     yield ts
@@ -92,7 +100,7 @@ def apply(self, context):
                     for start_idx in start_idxs:
                         for end_idx in end_idxs:
                             if start_idx < end_idx:
-                                ts = TemporarySpan(
+                                ts = TemporarySpanMention(
                                     char_start=start_idx,
                                     char_end=end_idx - 1,
                                     sentence=context,
@@ -147,7 +155,7 @@ class MentionFigures(MentionSpace):
     """Defines the space of Mentions as all figures in a Document *x*, indexing
     by **position offset**.
 
-    :param types: If specified, only yield TemporaryImages whose url ends in
+    :param types: If specified, only yield TemporaryFigureMentions whose url ends in
         one of the specified types. Example: types=["png", "jpg", "jpeg"].
     :type types: list, tuple of str
     """
@@ -179,7 +187,204 @@ def apply(self, session, doc):
             if self.types is None or any(
                 figure.url.lower().endswith(type) for type in self.types
             ):
-                yield TemporaryImage(figure)
+                yield TemporaryFigureMention(figure)
+
+
+class MentionSentences(MentionSpace):
+    """Defines the space of Mentions as all sentences in a Document *x*, indexing
+    by **position offset**.
+    """
+
+    def __init__(self):
+        """Initialize MentionSentences."""
+        MentionSpace.__init__(self)
+
+    def apply(self, session, doc):
+        """
+        Generate MentionSentences from a Document by parsing all of its Sentences.
+
+        :param session: The database session
+        :param doc: The ``Document`` to parse.
+        :type doc: ``Document``.
+        :raises TypeError: If the input doc is not of type ``Document``.
+        """
+        if not isinstance(doc, Document):
+            raise TypeError(
+                "Input Contexts to MentionSentences.apply() must be of type Document"
+            )
+
+        doc = session.query(Document).filter(Document.id == doc.id).one()
+        for sentence in doc.sentences:
+            yield TemporarySpanMention(
+                char_start=0, char_end=len(sentence.text) - 1, sentence=sentence
+            )
+
+
+class MentionParagraphs(MentionSpace):
+    """Defines the space of Mentions as all paragraphs in a Document *x*, indexing
+    by **position offset**.
+    """
+
+    def __init__(self):
+        """Initialize MentionParagraphs."""
+        MentionSpace.__init__(self)
+
+    def apply(self, session, doc):
+        """
+        Generate MentionParagraphs from a Document by parsing all of its Paragraphs.
+
+        :param session: The database session
+        :param doc: The ``Document`` to parse.
+        :type doc: ``Document``.
+        :raises TypeError: If the input doc is not of type ``Document``.
+        """
+        if not isinstance(doc, Document):
+            raise TypeError(
+                "Input Contexts to MentionParagraphs.apply() must be of type Document"
+            )
+
+        doc = session.query(Document).filter(Document.id == doc.id).one()
+        for paragraph in doc.paragraphs:
+            yield TemporaryParagraphMention(paragraph)
+
+
+class MentionCaptions(MentionSpace):
+    """Defines the space of Mentions as all captions in a Document *x*, indexing
+    by **position offset**.
+    """
+
+    def __init__(self):
+        """Initialize MentionCaptions."""
+        MentionSpace.__init__(self)
+
+    def apply(self, session, doc):
+        """
+        Generate MentionCaptions from a Document by parsing all of its Captions.
+
+        :param session: The database session
+        :param doc: The ``Document`` to parse.
+        :type doc: ``Document``.
+        :raises TypeError: If the input doc is not of type ``Document``.
+        """
+        if not isinstance(doc, Document):
+            raise TypeError(
+                "Input Contexts to MentionCaptions.apply() must be of type Document"
+            )
+
+        doc = session.query(Document).filter(Document.id == doc.id).one()
+        for caption in doc.captions:
+            yield TemporaryCaptionMention(caption)
+
+
+class MentionCells(MentionSpace):
+    """Defines the space of Mentions as all cells in a Document *x*, indexing
+    by **position offset**.
+    """
+
+    def __init__(self):
+        """Initialize MentionCells."""
+        MentionSpace.__init__(self)
+
+    def apply(self, session, doc):
+        """
+        Generate MentionCells from a Document by parsing all of its Cells.
+
+        :param session: The database session
+        :param doc: The ``Document`` to parse.
+        :type doc: ``Document``.
+        :raises TypeError: If the input doc is not of type ``Document``.
+        """
+        if not isinstance(doc, Document):
+            raise TypeError(
+                "Input Contexts to MentionCells.apply() must be of type Document"
+            )
+
+        doc = session.query(Document).filter(Document.id == doc.id).one()
+        for cell in doc.cells:
+            yield TemporaryCellMention(cell)
+
+
+class MentionTables(MentionSpace):
+    """Defines the space of Mentions as all tables in a Document *x*, indexing
+    by **position offset**.
+    """
+
+    def __init__(self):
+        """Initialize MentionTables."""
+        MentionSpace.__init__(self)
+
+    def apply(self, session, doc):
+        """
+        Generate MentionTables from a Document by parsing all of its Tables.
+
+        :param session: The database session
+        :param doc: The ``Document`` to parse.
+        :type doc: ``Document``.
+        :raises TypeError: If the input doc is not of type ``Document``.
+        """
+        if not isinstance(doc, Document):
+            raise TypeError(
+                "Input Contexts to MentionTables.apply() must be of type Document"
+            )
+
+        doc = session.query(Document).filter(Document.id == doc.id).one()
+        for table in doc.tables:
+            yield TemporaryTableMention(table)
+
+
+class MentionSections(MentionSpace):
+    """Defines the space of Mentions as all sections in a Document *x*, indexing
+    by **position offset**.
+    """
+
+    def __init__(self):
+        """Initialize MentionSections."""
+        MentionSpace.__init__(self)
+
+    def apply(self, session, doc):
+        """
+        Generate MentionSections from a Document by parsing all of its Sections.
+
+        :param session: The database session
+        :param doc: The ``Document`` to parse.
+        :type doc: ``Document``.
+        :raises TypeError: If the input doc is not of type ``Document``.
+        """
+        if not isinstance(doc, Document):
+            raise TypeError(
+                "Input Contexts to MentionSections.apply() must be of type Document"
+            )
+
+        doc = session.query(Document).filter(Document.id == doc.id).one()
+        for section in doc.sections:
+            yield TemporarySectionMention(section)
+
+
+class MentionDocuments(MentionSpace):
+    """Defines the space of Mentions as the Document *x* itself, indexing
+    by **document name**.
+    """
+
+    def __init__(self):
+        """Initialize MentionDocuments."""
+        MentionSpace.__init__(self)
+
+    def apply(self, session, doc):
+        """
+        Generate MentionDocuments from a Document by using the document itself.
+
+        :param session: The database session
+        :param doc: The ``Document`` to parse.
+        :type doc: ``Document``.
+        :raises TypeError: If the input doc is not of type ``Document``.
+        """
+        if not isinstance(doc, Document):
+            raise TypeError(
+                "Input Contexts to MentionDocuments.apply() must be of type Document"
+            )
+
+        doc = session.query(Document).filter(Document.id == doc.id).one()
+        yield TemporaryDocumentMention(doc)
 
 
 class MentionExtractor(UDFRunner):

diff --git a/src/fonduer/candidates/models/__init__.py b/src/fonduer/candidates/models/__init__.py
@@ -1,15 +1,27 @@
 from fonduer.candidates.models.candidate import Candidate, candidate_subclass
-from fonduer.candidates.models.image import Image
-from fonduer.candidates.models.implicitspan import ImplicitSpan
+from fonduer.candidates.models.figure_mention import FigureMention
+from fonduer.candidates.models.implicit_span_mention import ImplicitSpanMention
 from fonduer.candidates.models.mention import Mention, mention_subclass
-from fonduer.candidates.models.span import Span
+from fonduer.candidates.models.span_mention import SpanMention
+from fonduer.candidates.models.caption_mention import CaptionMention
+from fonduer.candidates.models.cell_mention import CellMention
+from fonduer.candidates.models.document_mention import DocumentMention
+from fonduer.candidates.models.paragraph_mention import ParagraphMention
+from fonduer.candidates.models.section_mention import SectionMention
+from fonduer.candidates.models.table_mention import TableMention
 
 __all__ = [
     "Candidate",
-    "Image",
-    "ImplicitSpan",
+    "FigureMention",
+    "ImplicitSpanMention",
     "Mention",
-    "Span",
+    "SpanMention",
     "candidate_subclass",
     "mention_subclass",
+    "CaptionMention",
+    "CellMention",
+    "DocumentMention",
+    "ParagraphMention",
+    "SectionMention",
+    "TableMention",
 ]