support csv format input

HazyResearch · Dec 18, 2018 · a0270af · a0270af
1 parent 19e2f81
commit a0270af
Show file tree

Hide file tree

Showing 6 changed files with 149 additions and 62 deletions.
diff --git a/src/fonduer/parser/preprocessors/__init__.py b/src/fonduer/parser/preprocessors/__init__.py
@@ -1,11 +1,11 @@
-from fonduer.parser.preprocessors.csv_paths_preprocessor import CSVPathsPreprocessor
+from fonduer.parser.preprocessors.csv_doc_preprocessor import CSVDocPreprocessor
 from fonduer.parser.preprocessors.doc_preprocessor import DocPreprocessor
 from fonduer.parser.preprocessors.html_doc_preprocessor import HTMLDocPreprocessor
 from fonduer.parser.preprocessors.text_doc_preprocessor import TextDocPreprocessor
 from fonduer.parser.preprocessors.tsv_doc_preprocessor import TSVDocPreprocessor
 
 __all__ = [
-    "CSVPathsPreprocessor",
+    "CSVDocPreprocessor",
     "DocPreprocessor",
     "HTMLDocPreprocessor",
     "TSVDocPreprocessor",

diff --git a/src/fonduer/parser/preprocessors/csv_doc_preprocessor.py b/src/fonduer/parser/preprocessors/csv_doc_preprocessor.py
@@ -0,0 +1,99 @@
+import codecs
+import csv
+import os
+
+from fonduer.parser.models import Document
+from fonduer.parser.preprocessors.doc_preprocessor import DocPreprocessor
+from fonduer.utils.utils_parser import build_node, column_constructor
+
+
+class CSVDocPreprocessor(DocPreprocessor):
+    """A generator which processes a CSV file or directory of CSV files into
+    a set of Document objects. It treats each line in the input file as a document.
+    This ``DocPreprocessor`` assumes that each column is one section and the content
+    in each column is one paragraph by default. However, if a column is complex, an
+    advanced parser may be used by specifying the ``parser_rule`` parameter: a dict
+    whose key is the column index and whose value is the rule applied to that
+    column, e.g., one that uses keywords as delimiters for paragraphs.
+
+    :param path: filesystem path to file or directory to parse.
+    :type path: str
+    :param encoding: file encoding to use (e.g. "utf-8").
+    :type encoding: str
+    :param max_docs: the maximum number of ``Documents`` to produce.
+    :type max_docs: int
+    :param header: whether the CSV file contains a header row; if so, the header
+        names are used as Section names. default = False
+    :type header: bool
+    :param delim: delimiter to be used to separate columns when file has
+        more than one column. It is active only when ``column is not
+        None``. default=','
+    :type delim: str
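+    :param parser_rule: The parser rule to be used to parse the specific column:
+        a dict mapping a column index to a callable that takes the column content
+        and returns (type, name, content) tuples. Columns without an entry are
+        handled by ``column_constructor``. default = None
+    :type parser_rule: dict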
+    :rtype: A generator of ``Documents``.
+    """
+
+    def __init__(
+        self,
+        path,
+        encoding="utf-8",
+        max_docs=float("inf"),
+        header=False,
+        delim=",",
+        parser_rule=None,
+    ):
+        super(CSVDocPreprocessor, self).__init__(path, encoding, max_docs)
+        self.header = header
+        self.delim = delim
+        self.parser_rule = parser_rule
+        self.n_parsed = 0
+
+    def _parse_file(self, fp, file_name):
+        name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
+        with codecs.open(fp, encoding=self.encoding) as f:
+            reader = csv.reader(f)
+
+            # Load CSV header
+            header_names = None
+            if self.header:
+                header_names = next(reader)
+
+            # Load document per row
+            for i, row in enumerate(reader):
+                if self.n_parsed == self.max_docs:
+                    break
+                sections = []
+                for j, content in enumerate(row):
+                    rule = (
+                        self.parser_rule[j]
+                        if self.parser_rule is not None and j in self.parser_rule
+                        else column_constructor
+                    )
+                    content_header = (
+                        header_names[j] if header_names is not None else None
+                    )
+                    context = [build_node(t, n, c) for t, n, c in rule(content)]
+                    sections.append(
+                        build_node("section", content_header, "".join(context))
+                    )
+
+                text = build_node("doc", None, "".join(sections))
+                doc_name = name + ":" + str(i)
+                stable_id = self._get_stable_id(doc_name)
+
+                yield Document(
+                    name=doc_name,
+                    stable_id=stable_id,
+                    text=text,
+                    meta={"file_name": file_name},
+                )
+                self.n_parsed += 1
+
+    def __len__(self):
+        """Provide a len attribute based on max_docs and number of files in folder."""
+        num_docs = min(len(self.all_files), self.max_docs)
+        return num_docs
+
+    def _can_read(self, fpath):
+        return fpath.lower().endswith(".csv")
diff --git a/src/fonduer/parser/preprocessors/csv_paths_preprocessor.py b/src/fonduer/parser/preprocessors/csv_paths_preprocessor.py
diff --git a/src/fonduer/parser/preprocessors/doc_preprocessor.py b/src/fonduer/parser/preprocessors/doc_preprocessor.py
@@ -7,10 +7,10 @@ class DocPreprocessor(object):
     A generator which processes a file or directory of files into a set of
     Document objects.
 
-    :param encoding: file encoding to use (e.g. "utf-8").
-    :type encoding: str
     :param path: filesystem path to file or directory to parse.
     :type path: str
+    :param encoding: file encoding to use (e.g. "utf-8").
+    :type encoding: str
     :param max_docs: the maximum number of ``Documents`` to produce.
     :type max_docs: int
     :rtype: A generator of ``Documents``.

diff --git a/src/fonduer/utils/udf.py b/src/fonduer/utils/udf.py
@@ -148,7 +148,7 @@ def fill_input_queue(in_queue, doc_loader, terminal_signal):
         count_parsed = 0
         while count_parsed < total_count:
             y = out_queue.get()
-            # Update progress bar whenever an item has been  processed
+            # Update progress bar whenever an item has been processed
             if y == UDF.TASK_DONE:
                 count_parsed += 1
                 if self.pb is not None:

diff --git a/src/fonduer/utils/utils_parser.py b/src/fonduer/utils/utils_parser.py
@@ -0,0 +1,45 @@
+def build_node(type, name, content):
+    """
+    Wrap up content into an HTML node.
+
+    :param type: content type (e.g., doc, section, text, figure)
+    :type type: str
+    :param name: content name (e.g., the name of the section)
+    :type name: str
+    :param content: actual content
+    :type content: str
+    :return: new String with content in html format
+    """
+    if type == "doc":
+        pattern = "<{}>{}</{}>"
+        return pattern.format("html", content, "html")
+    if type == "section":
+        pattern = "<{} name='{}'>{}</{}>"
+        return pattern.format("section", name, content, "section")
+    if type == "text":
+        pattern = "<{} name='{}'>{}</{}>"
+        return pattern.format("p", name, content, "p")
+    if type == "figure":
+        pattern = "<{} name='{}' src='{}'/>"
+        return pattern.format("img", name, content)
+
+
+def column_constructor(text, name=None, type="text", delim=None):
+    """
+    Converts raw content to a list of structured tuples where each tuple contains
+        (type, name, content).
+
+    :param text: content to be converted
+    :type text: str
+    :param name: content name (default: None)
+    :type name: str
+    :param type: content type (default: text)
+    :type type: str
+    :param delim: delimiter to split the content
+    :type delim: str
+    :return: A list of tuples where each tuple contains
+        (content type, content name, content)
+    """
+    if delim is None:
+        return [(type, name, text)]
+    return [(type, name, content) for content in text.split(delim)]