Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions python/sparknlp/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ class RegexMatcher(AnnotatorTransformer):
"strategy",
"MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
typeConverter=TypeConverters.toString)
rules = Param(Params._dummy(),
"rules",
rulesPath = Param(Params._dummy(),
"rulesPath",
"rules file path, must be a tuple of regex and identifier. replace config with this",
typeConverter=TypeConverters.toString)

Expand All @@ -115,8 +115,8 @@ def __init__(self):
def setStrategy(self, value):
return self._set(strategy=value)

def setRules(self, value):
return self._set(rules=value)
def setRulesPath(self, value):
return self._set(rulesPath=value)


class Lemmatizer(AnnotatorTransformer):
Expand Down
1 change: 1 addition & 0 deletions python/test/annotators.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def runTest(self):
.setOutputCol("document")
regex_matcher = RegexMatcher() \
.setStrategy("MATCH_ALL") \
.setRulesPath("../src/test/resources/regex-matcher/rules.txt") \
.setOutputCol("regex")
assembled = document_assembler.transform(self.data)
regex_matcher.transform(assembled).show()
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/com/jsl/nlp/Finisher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Finisher(override val uid: String)
def getOutputCols: Array[String] = get(outputCols).getOrElse(getInputCols.map("finished_" + _))
def getInputCols: Array[String] = $(inputCols)
def getValueSplitSymbol: String = $(valueSplitSymbol)
def getAnnotationSpltSymbol: String = $(annotationSplitSymbol)
def getAnnotationSplitSymbol: String = $(annotationSplitSymbol)
def getCleanAnnotations: Boolean = $(cleanAnnotations)
def getIncludeKeys: Boolean = $(includeKeys)

Expand Down
33 changes: 21 additions & 12 deletions src/main/scala/com/jsl/nlp/annotators/RegexMatcher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,26 @@ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
* -- MATCH_COMPLETE returns only if match is entire target.
*/
class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher] {

import com.jsl.nlp.AnnotatorType._

// ToDo: Check wether this annotator can be stored to disk as is. otherwise turn regex into string
val rules: Param[String] = new Param(this, "rules", "regex patterns to match")
lazy val defaultRules: Array[(String, String)] = ResourceHelper.retrieveRegexMatchRules()

// ToDo: Check whether this annotator can be stored to disk as is. otherwise turn regex into string

private var loadedRules: Array[(String, String)] = ResourceHelper.retrieveRegexMatchRules()
/**
  * Path of the file the rules are read from when no explicit `rules` value has been set.
  * The path is handed to `ResourceHelper.retrieveRegexMatchRules`; a default of "__default"
  * is registered for it elsewhere in this class.
  */
val rulesPath: Param[String] = new Param(this, "rulesPath", "File containing rules separated by commas")

/**
  * Explicit rules as pairs; each pair is turned into a `RegexRule(first, second)` when annotating.
  * When this param is set it is used instead of loading rules from `rulesPath`.
  * NOTE(review): the param description speaks of "rule strings separated by commas" while the
  * value type is an array of pairs -- presumably (regex, identifier); confirm and align the text.
  */
val rules: Param[Array[(String, String)]] = new Param(this, "rules", "Array of rule strings separated by commas")

/** Name of the matching strategy: one of MATCH_ALL, MATCH_FIRST or MATCH_COMPLETE. */
val strategy: Param[String] = new Param(this, "strategy", "MATCH_ALL|MATCH_FIRST|MATCH_COMPLETE")

/** Sets the path of the rules file and returns this instance for chaining. */
def setRulesPath(path: String): this.type = set(rulesPath, path)

/** Returns the currently configured rules file path. */
def getRulesPath: String = $(rulesPath)

/** Sets the rules directly, taking precedence over `rulesPath`; returns this instance for chaining. */
def setRules(value: Array[(String, String)]): this.type = set(rules, value)

/**
  * Returns the rules this matcher applies.
  *
  * Explicitly set `rules` win; when none were set the rules are resolved from `rulesPath`,
  * mirroring the lookup done while annotating. Reading the param with `$(rules)` instead would
  * throw when only a path has been configured, since no default is visible for `rules`.
  *
  * @return the rule pairs, either as set by the caller or as loaded from the rules path
  */
def getRules: Array[(String, String)] = get(rules).getOrElse(resolveRulesFromPath())

private val matchFactory = RuleFactory.lateMatching(TransformStrategy.NO_TRANSFORM)(_)

override val annotatorType: AnnotatorType = REGEX
Expand All @@ -36,19 +46,17 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher

setDefault(inputCols, Array(DOCUMENT))

def this() = this(Identifiable.randomUID("REGEX_MATCHER"))

def getRules: Array[(String, String)] = loadedRules
setDefault(rulesPath, "__default")

def setRules(path: String): this.type = {
loadedRules = ResourceHelper.retrieveRegexMatchRules(path)
set(rules, path)
}
def this() = this(Identifiable.randomUID("REGEX_MATCHER"))

def setStrategy(value: String): this.type = set(strategy, value)

def getStrategy: String = $(strategy).toString

/**
  * Loads the rule pairs from the location currently held by the `rulesPath` param.
  *
  * @return whatever `ResourceHelper.retrieveRegexMatchRules` yields for that path
  */
private def resolveRulesFromPath(): Array[(String, String)] = {
  val configuredPath = $(rulesPath)
  ResourceHelper.retrieveRegexMatchRules(configuredPath)
}

private def getFactoryStrategy: MatchStrategy = $(strategy) match {
case "MATCH_ALL" => MatchStrategy.MATCH_ALL
case "MATCH_FIRST" => MatchStrategy.MATCH_FIRST
Expand All @@ -60,7 +68,7 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
annotations.flatMap { annotation =>
matchFactory(getFactoryStrategy)
.setRules(loadedRules.map(r => new RegexRule(r._1, r._2)))
.setRules(get(rules).getOrElse(resolveRulesFromPath()).map(r => new RegexRule(r._1, r._2)))
.findMatch(annotation.metadata(AnnotatorType.DOCUMENT)).map { m =>
Annotation(
annotatorType,
Expand All @@ -72,4 +80,5 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher
}
}
}

object RegexMatcher extends DefaultParamsReadable[RegexMatcher]
2 changes: 1 addition & 1 deletion src/test/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ nlp {
format = "txt"
}
regexMatcher {
file = ""
file = "/regex-matcher/rules.txt"
format = "txt"
separator = ","
}
Expand Down
2 changes: 2 additions & 0 deletions src/test/resources/regex-matcher/rules.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
the\s\w+, followed by 'the'
ceremonies, "ceremony"
3 changes: 2 additions & 1 deletion src/test/scala/com/jsl/nlp/AnnotatorBuilder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,12 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
.transform(withFullPragmaticSentenceDetector(withTokenizer(dataset)))
}

def withRegexMatcher(dataset: Dataset[Row], rules: Array[(String, String)], strategy: String): Dataset[Row] = {
/**
  * Applies a [[RegexMatcher]] to `dataset`, reading the "document" column and writing "regex".
  *
  * @param dataset  input rows; must already carry the "document" column
  * @param rules    custom rule pairs; when empty no rules are set on the matcher
  * @param strategy matching strategy name passed to the matcher
  * @return the dataset produced by the matcher's `transform`
  */
def withRegexMatcher(dataset: Dataset[Row], rules: Array[(String, String)] = Array.empty[(String, String)], strategy: String): Dataset[Row] = {
  val matcher = new RegexMatcher()
    .setStrategy(strategy)
    .setInputCols(Array("document"))
    .setOutputCol("regex")
  // Only push rules onto the matcher when the caller actually supplied some.
  val configured = if (rules.isEmpty) matcher else matcher.setRules(rules)
  configured.transform(dataset)
}

Expand Down
66 changes: 44 additions & 22 deletions src/test/scala/com/jsl/nlp/annotators/RegexMatcherBehaviors.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,33 +8,55 @@ import scala.language.reflectiveCalls

trait RegexMatcherBehaviors { this: FlatSpec =>
def fixture(dataset: Dataset[Row], rules: Array[(String, String)], strategy: String) = new {
val df = AnnotatorBuilder.withRegexMatcher(dataset, rules, strategy)
val regexAnnotations = df.select("regex")
val annotationDataset: Dataset[_] = AnnotatorBuilder.withRegexMatcher(dataset, rules, strategy)
val regexAnnotations: Array[Annotation] = annotationDataset.select("regex")
.collect
.flatMap { _.getSeq[Row](0) }
.map { Annotation(_) }

df.show
annotationDataset.show()
}

def predefinedRulesRegexMatcher(dataset: => Dataset[Row], rules: Array[(String, String)], strategy: String): Unit = {
// "A RegexMatcher Annotator" should s"successfuly match ${rules.map(_._1).mkString(",")}" in {
// val f = fixture(dataset, rules, strategy)
// f.regexAnnotations.foreach { a =>
// assert(a.metadata == REGEX)
// }
// }
//
// it should "create annotations" in {
// val f = fixture(dataset, rules, strategy)
// assert(f.regexAnnotations.size > 0)
// }
//
// it should "create annotations with the correct tag" in {
// val f = fixture(dataset, rules, strategy)
// f.regexAnnotations.foreach { a =>
// assert(a.annotatorType == REGEX)
// }
// }
/**
  * Shared behaviour for a matcher that is given no custom rules.
  *
  * Registers three tests: every annotation carries non-empty metadata, at least one
  * annotation is produced, and every annotation is tagged with the REGEX annotator type.
  *
  * @param dataset  by-name dataset, re-evaluated for each registered test
  * @param strategy matching strategy name handed to the fixture
  */
def predefinedRulesRegexMatcher(dataset: => Dataset[Row], strategy: String): Unit = {
  // An empty rule array is what the fixture receives when no custom rules are wanted.
  val noRules = Array.empty[(String, String)]
  // A def, not a val: the fixture (and the by-name dataset) is built inside each test body.
  def annotated = fixture(dataset, noRules, strategy).regexAnnotations

  "A RegexMatcher Annotator with predefined rules" should "successfuly match" in {
    for (annotation <- annotated) {
      assert(annotation.metadata.toArray.nonEmpty)
    }
  }

  it should "create annotations" in {
    assert(annotated.nonEmpty)
  }

  it should "create annotations with the correct tag" in {
    for (annotation <- annotated) {
      assert(annotation.annotatorType == REGEX)
    }
  }
}

/**
  * Shared behaviour for a matcher configured with caller-supplied rules.
  *
  * Registers three tests: every annotation carries non-empty metadata, at least one
  * annotation is produced, and every annotation is tagged with the REGEX annotator type.
  *
  * @param dataset  by-name dataset, re-evaluated for each registered test
  * @param rules    rule pairs handed to the fixture; the first elements appear in the test name
  * @param strategy matching strategy name handed to the fixture
  */
def customizedRulesRegexMatcher(dataset: => Dataset[Row], rules: Array[(String, String)], strategy: String): Unit = {
  // A def, not a val: the fixture (and the by-name dataset) is built inside each test body.
  def annotated = fixture(dataset, rules, strategy).regexAnnotations

  "A RegexMatcher Annotator with custom rules" should s"successfuly match ${rules.map(_._1).mkString(",")}" in {
    for (annotation <- annotated) {
      assert(annotation.metadata.toArray.nonEmpty)
    }
  }

  it should "create annotations" in {
    assert(annotated.nonEmpty)
  }

  it should "create annotations with the correct tag" in {
    for (annotation <- annotated) {
      assert(annotation.annotatorType == REGEX)
    }
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ class RegexMatcherTestSpec extends FlatSpec with RegexMatcherBehaviors {
("the\\s\\w+", "followed by 'the'"),
("ceremonies", "ceremony")
)
"A full RegexMatcher pipeline with content" should behave like predefinedRulesRegexMatcher(df, rules, strategy)
"A full RegexMatcher pipeline with content" should behave like customizedRulesRegexMatcher(df, rules, strategy)
it should behave like predefinedRulesRegexMatcher(df, strategy)
}