Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable RegexMatchSpan to match words concatenated with the sep="(separator)" option #492

Merged
merged 4 commits into from
Aug 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ Added

Changed
^^^^^^^
* `@YasushiMiyata`_: Enable `RegexMatchSpan` to match words concatenated with the sep="(separator)" option.
(`#270 <https://github.com/HazyResearch/fonduer/issues/270>`_)
(`#492 <https://github.com/HazyResearch/fonduer/pull/492>`_)
* `@HiromuHota`_: Enabled "Type hints (PEP 484) support for the Sphinx autodoc extension."
(`#421 <https://github.com/HazyResearch/fonduer/pull/421>`_)
* `@HiromuHota`_: Switched the Cython wrapper for Mecab from mecab-python3 to fugashi.
Expand Down
2 changes: 1 addition & 1 deletion src/fonduer/candidates/matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def init(self) -> None:
raise Exception("Please supply a regular expression string r as rgx=r.")
self.ignore_case = self.opts.get("ignore_case", True)
self.attrib = self.opts.get("attrib", WORDS)
self.sep = self.opts.get("sep", " ")
self.sep = self.opts.get("sep", "")

# Extending the _RegexMatch to handle search (instead of only match)
# and adding a toggle for full span match.
Expand Down
9 changes: 5 additions & 4 deletions src/fonduer/candidates/models/implicit_span_mention.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,21 +151,22 @@ def get_attrib_tokens(self, a: str = "words") -> List:
"""
return self.__getattribute__(a)

def get_attrib_span(self, a: str, sep: str = " ") -> str:
def get_attrib_span(self, a: str, sep: str = "") -> str:
"""Get the span of sentence attribute *a*.

Intuitively, like calling::

sep.join(implicit_span.a)

:param a: The attribute to get a span for.
:param sep: The separator to use for the join.
:param sep: The separator to use for the join,
or to be removed from text if a="words".
:return: The joined tokens, or text if a="words".
"""
if a == "words":
return self.text
return self.text.replace(sep, "")
else:
return sep.join(self.get_attrib_tokens(a))
return sep.join([str(n) for n in self.get_attrib_tokens(a)])

def __getitem__(self, key: slice) -> "TemporaryImplicitSpanMention":
"""Slice operation returns a new candidate sliced according to **char index**.
Expand Down
11 changes: 7 additions & 4 deletions src/fonduer/candidates/models/span_mention.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,23 +138,26 @@ def get_attrib_tokens(self, a: str = "words") -> List:
self.get_word_start_index() : self.get_word_end_index() + 1
]

def get_attrib_span(self, a: str, sep: str = " ") -> str:
def get_attrib_span(self, a: str, sep: str = "") -> str:
"""Get the span of sentence attribute *a*.

Intuitively, like calling::

sep.join(span.a)

:param a: The attribute to get a span for.
:param sep: The separator to use for the join.
:param sep: The separator to use for the join,
or to be removed from text if a="words".
:return: The joined tokens, or text if a="words".
"""
# NOTE: Special behavior for words currently (due to correspondence
# with char_offsets)
if a == "words":
return self.sentence.text[self.char_start : self.char_end + 1]
return self.sentence.text[self.char_start : self.char_end + 1].replace(
sep, ""
)
else:
return sep.join(self.get_attrib_tokens(a))
return sep.join([str(n) for n in self.get_attrib_tokens(a)])

def get_span(self) -> str:
"""Return the text of the ``Span``.
Expand Down
4 changes: 4 additions & 0 deletions tests/candidates/test_matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,10 @@ def test_regex_match(doc_setup):
matcher = RegexMatchEach(rgx=r"Apple", ignore_case=False)
assert list(matcher.apply(space.apply(doc))) == []

# Test sep option
matcher = RegexMatchSpan(rgx=r"isapple", sep=" ")
assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"is apple"}


def test_ner_matchers():
"""Test different ner type matchers."""
Expand Down