Skip to content

Commit

Permalink
support csv format input
Browse files Browse the repository at this point in the history
  • Loading branch information
senwu committed Dec 18, 2018
1 parent 19e2f81 commit a0270af
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 62 deletions.
4 changes: 2 additions & 2 deletions src/fonduer/parser/preprocessors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from fonduer.parser.preprocessors.csv_paths_preprocessor import CSVPathsPreprocessor
from fonduer.parser.preprocessors.csv_doc_preprocessor import CSVDocPreprocessor
from fonduer.parser.preprocessors.doc_preprocessor import DocPreprocessor
from fonduer.parser.preprocessors.html_doc_preprocessor import HTMLDocPreprocessor
from fonduer.parser.preprocessors.text_doc_preprocessor import TextDocPreprocessor
from fonduer.parser.preprocessors.tsv_doc_preprocessor import TSVDocPreprocessor

__all__ = [
"CSVPathsPreprocessor",
"CSVDocPreprocessor",
"DocPreprocessor",
"HTMLDocPreprocessor",
"TSVDocPreprocessor",
Expand Down
99 changes: 99 additions & 0 deletions src/fonduer/parser/preprocessors/csv_doc_preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import codecs
import csv
import os

from fonduer.parser.models import Document
from fonduer.parser.preprocessors.doc_preprocessor import DocPreprocessor
from fonduer.utils.utils_parser import build_node, column_constructor


class CSVDocPreprocessor(DocPreprocessor):
    """A generator which processes a CSV file or directory of CSV files into
    a set of Document objects.

    Each data row of an input file is treated as one document. By default
    every column becomes one section whose content is a single paragraph.
    If a column needs more elaborate handling, a custom rule can be supplied
    for it through ``parser_rule``.

    :param path: filesystem path to file or directory to parse.
    :type path: str
    :param encoding: file encoding to use (e.g. "utf-8").
    :type encoding: str
    :param max_docs: the maximum number of ``Documents`` to produce.
    :type max_docs: int
    :param header: whether the CSV file starts with a header row; if so, the
        header values are used as section names. default = False
    :type header: bool
    :param delim: single-character delimiter used to separate columns.
        default = ','
    :type delim: str
    :param parser_rule: optional mapping from column index to a callable.
        The callable receives the raw column text and must return an
        iterable of ``(type, name, content)`` tuples, each of which is turned
        into a node with ``build_node``. Columns without an entry fall back
        to ``column_constructor``. default = None
    :type parser_rule: dict
    :rtype: A generator of ``Documents``.
    """

    def __init__(
        self,
        path,
        encoding="utf-8",
        max_docs=float("inf"),
        header=False,
        delim=",",
        parser_rule=None,
    ):
        super(CSVDocPreprocessor, self).__init__(path, encoding, max_docs)
        self.header = header
        self.delim = delim
        self.parser_rule = parser_rule
        # Number of documents yielded so far; it is never reset in this
        # class, so ``max_docs`` is enforced across all files parsed.
        self.n_parsed = 0

    def _parse_file(self, fp, file_name):
        """Yield one ``Document`` per data row of the CSV file at ``fp``.

        :param fp: path of the CSV file to read.
        :param file_name: value stored under ``"file_name"`` in each
            document's ``meta``.
        """
        # Document names are "<file name without extension>:<row index>".
        base_name = os.path.basename(fp)
        name = base_name[: base_name.rfind(".")]
        with codecs.open(fp, encoding=self.encoding) as f:
            # Honour the configured delimiter instead of always using ",".
            reader = csv.reader(f, delimiter=self.delim)

            # Load CSV header. A default is passed to ``next`` so an empty
            # file ends the generator cleanly rather than leaking
            # StopIteration (a RuntimeError inside generators, PEP 479).
            header_names = None
            if self.header:
                header_names = next(reader, None)

            # Load document per row
            for i, row in enumerate(reader):
                if self.n_parsed == self.max_docs:
                    break
                sections = []
                for j, content in enumerate(row):
                    # Per-column rule if one is configured, else the default.
                    rule = (
                        self.parser_rule[j]
                        if self.parser_rule is not None and j in self.parser_rule
                        else column_constructor
                    )
                    content_header = (
                        header_names[j] if header_names is not None else None
                    )
                    context = [build_node(t, n, c) for t, n, c in rule(content)]
                    sections.append(
                        build_node("section", content_header, "".join(context))
                    )

                text = build_node("doc", None, "".join(sections))
                doc_name = name + ":" + str(i)
                stable_id = self._get_stable_id(doc_name)

                yield Document(
                    name=doc_name,
                    stable_id=stable_id,
                    text=text,
                    meta={"file_name": file_name},
                )
                self.n_parsed += 1

    def __len__(self):
        """Provide a len attribute based on max_docs and number of files in folder.

        Note that this counts files, not rows, so it does not equal the
        number of documents produced when a file has more than one row.
        """
        num_docs = min(len(self.all_files), self.max_docs)
        return num_docs

    def _can_read(self, fpath):
        """Return True if ``fpath`` has a ``.csv`` extension (case-insensitive)."""
        return fpath.lower().endswith(".csv")
57 changes: 0 additions & 57 deletions src/fonduer/parser/preprocessors/csv_paths_preprocessor.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/fonduer/parser/preprocessors/doc_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ class DocPreprocessor(object):
A generator which processes a file or directory of files into a set of
Document objects.
:param encoding: file encoding to use (e.g. "utf-8").
:type encoding: str
:param path: filesystem path to file or directory to parse.
:type path: str
:param encoding: file encoding to use (e.g. "utf-8").
:type encoding: str
:param max_docs: the maximum number of ``Documents`` to produce.
:type max_docs: int
:rtype: A generator of ``Documents``.
Expand Down
2 changes: 1 addition & 1 deletion src/fonduer/utils/udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def fill_input_queue(in_queue, doc_loader, terminal_signal):
count_parsed = 0
while count_parsed < total_count:
y = out_queue.get()
# Update progress bar whenever an item has been processed
# Update progress bar whenever an item has been processed
if y == UDF.TASK_DONE:
count_parsed += 1
if self.pb is not None:
Expand Down
45 changes: 45 additions & 0 deletions src/fonduer/utils/utils_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
def build_node(type, name, content):
    """
    Wrap up content in to a html node.

    :param type: content type (one of "doc", "section", "text", "figure")
    :type type: str
    :param name: content name (e.g., the name of the section); ignored for
        type "doc"
    :type name: str
    :param content: actual content; for type "figure" it is used as the
        ``src`` attribute of the image
    :type content: str
    :return: new String with content in html format
    :raises ValueError: if ``type`` is not one of the supported types
    """
    # NOTE(review): ``name`` and ``content`` are interpolated verbatim, with
    # no HTML escaping; "doc"/"section" content is already-built markup.
    if type == "doc":
        return "<html>{}</html>".format(content)
    if type == "section":
        return "<section name='{}'>{}</section>".format(name, content)
    if type == "text":
        return "<p name='{}'>{}</p>".format(name, content)
    if type == "figure":
        return "<img name='{}' src='{}'/>".format(name, content)
    # Fail loudly here rather than returning None, which would only surface
    # later as an unrelated TypeError when the caller joins the nodes.
    raise ValueError("Unknown node type: {!r}".format(type))


def column_constructor(text, name=None, type="text", delim=None):
    """
    Turn raw column content into a list of ``(type, name, content)`` tuples.

    :param text: content to be converted
    :type text: str
    :param name: content name shared by every tuple (default: None)
    :type name: str
    :param type: content type shared by every tuple (default: text)
    :type type: str
    :param delim: delimiter to split the content on; when None the content
        is kept as a single piece
    :type delim: str
    :return: A list of tuple where each tuple contains
        (content type, content name, content)
    """
    # Without a delimiter the whole text is the only piece.
    pieces = [text] if delim is None else text.split(delim)
    return [(type, name, piece) for piece in pieces]

0 comments on commit a0270af

Please sign in to comment.