Renaming OmniParser to just Parser

HazyResearch · Jul 18, 2018 · 9a0b531 · 9a0b531
1 parent 0b2c1bf
commit 9a0b531
Show file tree

Hide file tree

Showing 5 changed files with 28 additions and 28 deletions.
diff --git a/fonduer/__init__.py b/fonduer/__init__.py
@@ -21,7 +21,7 @@
 from fonduer.candidates.models import candidate_subclass
 from fonduer.learning import GenerativeModel, SparseLogisticRegression
 from fonduer.meta import Meta
-from fonduer.parser import OmniParser
+from fonduer.parser import Parser
 from fonduer.parser.models import Document, Figure, Sentence, Table
 from fonduer.parser.preprocessors import HTMLDocPreprocessor
 from fonduer.supervision.annotations import load_gold_labels
@@ -52,9 +52,9 @@
     "NumberMatcher",
     "OmniFigures",
     "OmniNgrams",
-    "OmniParser",
     "OrganizationMatcher",
     "PDFPreprocessor",
+    "Parser",
     "PersonMatcher",
     "RegexMatchEach",
     "RegexMatchSpan",

diff --git a/fonduer/parser/__init__.py b/fonduer/parser/__init__.py
@@ -1,3 +1,3 @@
-from fonduer.parser.parser import OmniParser
+from fonduer.parser.parser import Parser
 
-__all__ = ["OmniParser"]
+__all__ = ["Parser"]
diff --git a/fonduer/parser/parser.py b/fonduer/parser/parser.py
@@ -26,7 +26,7 @@
 logger = logging.getLogger(__name__)
 
 
-class OmniParser(UDFRunner):
+class Parser(UDFRunner):
     def __init__(
         self,
         structural=True,  # structural information
@@ -43,8 +43,8 @@ def __init__(
         # Use spaCy as our lingual parser
         self.lingual_parser = Spacy()
 
-        super(OmniParser, self).__init__(
-            OmniParserUDF,
+        super(Parser, self).__init__(
+            ParserUDF,
             structural=structural,
             blacklist=blacklist,
             flatten=flatten,
@@ -65,7 +65,7 @@ def clear(self, session, **kwargs):
         session.query(Candidate).delete()
 
 
-class OmniParserUDF(UDF):
+class ParserUDF(UDF):
     def __init__(
         self,
         structural,
@@ -89,7 +89,7 @@ def __init__(
         a regex and _replace_ is a character string. All occurrences of _pattern_ in the
         text will be replaced by _replace_.
         """
-        super(OmniParserUDF, self).__init__(**kwargs)
+        super(ParserUDF, self).__init__(**kwargs)
 
         # structural (html) setup
         self.structural = structural
@@ -125,7 +125,7 @@ def apply(self, x, **kwargs):
         if self.visual:
             if not self.pdf_path:
                 logger.error("Visual parsing failed: pdf_path is required")
-            for _ in self.parse_structure(document, text):
+            for _ in self.parse(document, text):
                 pass
             # Add visual attributes
             filename = self.pdf_path + document.name
@@ -141,7 +141,7 @@ def apply(self, x, **kwargs):
                 document.name, document.sentences, self.pdf_path
             )
         else:
-            yield from self.parse_structure(document, text)
+            yield from self.parse(document, text)
 
     def _flatten(self, node):
         """Construct a string containing the child's text/tail appended to the node.
@@ -433,7 +433,7 @@ def _parse_node(self, node, state):
         # Now, process the Sentence
         yield from self._parse_sentence(node, state)
 
-    def parse_structure(self, document, text):
+    def parse(self, document, text):
         """Depth-first search over the provided tree.
 
         Implemented as an iterative procedure. The structure of the state

diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py
@@ -21,7 +21,7 @@
     Intersect,
     LambdaFunctionMatcher,
     Meta,
-    OmniParser,
+    Parser,
     RegexMatchSpan,
     Sentence,
     SparseLogisticRegression,
@@ -57,7 +57,7 @@ def test_e2e(caplog):
 
     doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
 
-    corpus_parser = OmniParser(
+    corpus_parser = Parser(
         structural=True, lingual=True, visual=True, pdf_path=pdf_path
     )
     corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

diff --git a/tests/parser/test_parser.py b/tests/parser/test_parser.py
@@ -9,9 +9,9 @@
 import os
 
 from fonduer import Meta
-from fonduer.parser import OmniParser
+from fonduer.parser import Parser
 from fonduer.parser.models import Document, Sentence
-from fonduer.parser.parser import OmniParserUDF
+from fonduer.parser.parser import ParserUDF
 from fonduer.parser.preprocessors import HTMLDocPreprocessor
 from fonduer.parser.spacy_parser import Spacy
 
@@ -22,7 +22,7 @@ def test_parse_md_details(caplog):
     """Unit test of the final results stored in the database of the md document.
 
     This test only looks at the final results such that the implementation of
-    the OmniParserUDF's apply() can be modified.
+    the ParserUDF's apply() can be modified.
     """
     caplog.set_level(logging.INFO)
     logger = logging.getLogger(__name__)
@@ -36,8 +36,8 @@ def test_parse_md_details(caplog):
     # Preprocessor for the Docs
     preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
 
-    # Create an OmniParser and parse the md document
-    omni = OmniParser(
+    # Create a Parser and parse the md document
+    omni = Parser(
         structural=True, tabular=True, lingual=True, visual=True, pdf_path=pdf_path
     )
     omni.apply(preprocessor, parallelism=PARALLEL)
@@ -104,7 +104,7 @@ def test_parse_md_details(caplog):
 
 
 def test_simple_tokenizer(caplog):
-    """Unit test of OmniParser on a single document with lingual features off."""
+    """Unit test of Parser on a single document with lingual features off."""
     caplog.set_level(logging.INFO)
     logger = logging.getLogger(__name__)
     session = Meta.init("postgres://localhost:5432/" + ATTRIBUTE).Session()
@@ -122,7 +122,7 @@ def test_simple_tokenizer(caplog):
     # Preprocessor for the Docs
     preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
 
-    omni = OmniParser(structural=True, lingual=False, visual=True, pdf_path=pdf_path)
+    omni = Parser(structural=True, lingual=False, visual=True, pdf_path=pdf_path)
     omni.apply(preprocessor, parallelism=PARALLEL)
 
     doc = session.query(Document).order_by(Document.name).all()[1]
@@ -148,7 +148,7 @@ def test_simple_tokenizer(caplog):
 
 
 def test_parse_document_diseases(caplog):
-    """Unit test of OmniParser on a single document.
+    """Unit test of Parser on a single document.
 
     This tests both the structural and visual parse of the document.
     """
@@ -169,8 +169,8 @@ def test_parse_document_diseases(caplog):
     # Preprocessor for the Docs
     preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
 
-    # Create an OmniParser and parse the diseases document
-    omni = OmniParser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
+    # Create a Parser and parse the diseases document
+    omni = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
     omni.apply(preprocessor, parallelism=PARALLEL)
 
     # Grab the diseases document
@@ -241,7 +241,7 @@ def test_spacy_integration(caplog):
     max_docs = 2
     doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
 
-    corpus_parser = OmniParser(
+    corpus_parser = Parser(
         structural=True, lingual=True, visual=False, pdf_path=pdf_path
     )
     corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
@@ -276,14 +276,14 @@ def test_parse_style(caplog):
     # Preprocessor for the Docs
     preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
 
-    # Create an OmniParser and parse the md document
-    omni = OmniParser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
+    # Create a Parser and parse the md document
+    omni = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
     omni.apply(preprocessor, parallelism=PARALLEL)
 
     # Grab the document
     doc = session.query(Document).order_by(Document.name).all()[0]
 
-    # Grab the sentences parsed by the OmniParser
+    # Grab the sentences parsed by the Parser
     sentences = list(session.query(Sentence).order_by(Sentence.sentence_num).all())
 
     logger.warning("Doc: {}".format(doc))