HazyResearch · senwu · Aug 5, 2020 · Jul 31, 2020 · Aug 1, 2020 · Aug 1, 2020
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -21,6 +21,9 @@ Added
 
 Changed
 ^^^^^^^
+* `@YasushiMiyata`_: Enabled `RegexMatchSpan` to concatenate words using the sep="(separator)" option.
+  (`#270 <https://github.com/HazyResearch/fonduer/issues/270>`_)
+  (`#492 <https://github.com/HazyResearch/fonduer/pull/492>`_)
 * `@HiromuHota`_: Enabled "Type hints (PEP 484) support for the Sphinx autodoc extension."
   (`#421 <https://github.com/HazyResearch/fonduer/pull/421>`_)
 * `@HiromuHota`_: Switched the Cython wrapper for Mecab from mecab-python3 to fugashi.

diff --git a/src/fonduer/candidates/matchers.py b/src/fonduer/candidates/matchers.py
@@ -294,7 +294,7 @@ def init(self) -> None:
             raise Exception("Please supply a regular expression string r as rgx=r.")
         self.ignore_case = self.opts.get("ignore_case", True)
         self.attrib = self.opts.get("attrib", WORDS)
-        self.sep = self.opts.get("sep", " ")
+        self.sep = self.opts.get("sep", "")
 
         # Extending the _RegexMatch to handle search(instead of only match)
         # and adding a toggle for full span match.

diff --git a/src/fonduer/candidates/models/implicit_span_mention.py b/src/fonduer/candidates/models/implicit_span_mention.py
@@ -151,21 +151,22 @@ def get_attrib_tokens(self, a: str = "words") -> List:
         """
         return self.__getattribute__(a)
 
-    def get_attrib_span(self, a: str, sep: str = " ") -> str:
+    def get_attrib_span(self, a: str, sep: str = "") -> str:
         """Get the span of sentence attribute *a*.
 
         Intuitively, like calling::
 
             sep.join(implicit_span.a)
 
         :param a: The attribute to get a span for.
-        :param sep: The separator to use for the join.
+        :param sep: The separator to use for the join,
+                    or to be removed from text if a="words".
         :return: The joined tokens, or text if a="words".
         """
         if a == "words":
-            return self.text
+            return self.text.replace(sep, "")
         else:
-            return sep.join(self.get_attrib_tokens(a))
+            return sep.join([str(n) for n in self.get_attrib_tokens(a)])
 
     def __getitem__(self, key: slice) -> "TemporaryImplicitSpanMention":
         """Slice operation returns a new candidate sliced according to **char index**.

diff --git a/src/fonduer/candidates/models/span_mention.py b/src/fonduer/candidates/models/span_mention.py
@@ -138,23 +138,26 @@ def get_attrib_tokens(self, a: str = "words") -> List:
             self.get_word_start_index() : self.get_word_end_index() + 1
         ]
 
-    def get_attrib_span(self, a: str, sep: str = " ") -> str:
+    def get_attrib_span(self, a: str, sep: str = "") -> str:
         """Get the span of sentence attribute *a*.
 
         Intuitively, like calling::
 
             sep.join(span.a)
 
         :param a: The attribute to get a span for.
-        :param sep: The separator to use for the join.
+        :param sep: The separator to use for the join,
+                    or to be removed from text if a="words".
         :return: The joined tokens, or text if a="words".
         """
         # NOTE: Special behavior for words currently (due to correspondence
         # with char_offsets)
         if a == "words":
-            return self.sentence.text[self.char_start : self.char_end + 1]
+            return self.sentence.text[self.char_start : self.char_end + 1].replace(
+                sep, ""
+            )
         else:
-            return sep.join(self.get_attrib_tokens(a))
+            return sep.join([str(n) for n in self.get_attrib_tokens(a)])
 
     def get_span(self) -> str:
         """Return the text of the ``Span``.

diff --git a/tests/candidates/test_matchers.py b/tests/candidates/test_matchers.py
@@ -329,6 +329,10 @@ def test_regex_match(doc_setup):
     matcher = RegexMatchEach(rgx=r"Apple", ignore_case=False)
     assert list(matcher.apply(space.apply(doc))) == []
 
+    # Test sep option
+    matcher = RegexMatchSpan(rgx=r"isapple", sep=" ")
+    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"is apple"}
+
 
 def test_ner_matchers():
     """Test different ner type matchers."""