diff --git a/docs/components.html b/docs/components.html
index f6922074a17ce3..d191942d012412 100644
--- a/docs/components.html
+++ b/docs/components.html
@@ -243,6 +243,7 @@
3. Normalizer: Text cleaning
-
setPattern(pattern): Regular expression for normalization, defaults [^A-Za-z]
+ setLowercase(value): whether to convert tokens to lowercase, default true
Example:
diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
index c4421c004af396..1548d83b0499db 100755
--- a/python/sparknlp/annotator.py
+++ b/python/sparknlp/annotator.py
@@ -115,6 +115,10 @@ class Normalizer(AnnotatorTransformer):
"normalization regex pattern which match will be replaced with a space",
typeConverter=TypeConverters.toString)
+ lowercase = Param(Params._dummy(), "lowercase",
+ "whether to convert strings to lowercase",
+ typeConverter=TypeConverters.toBoolean)
+
@keyword_only
def __init__(self):
super(Normalizer, self).__init__()
@@ -123,6 +127,8 @@ def __init__(self):
def setPattern(self, value):
return self._set(pattern=value)
+ def setLowercase(self, value):
+ return self._set(lowercase=value)
class RegexMatcher(AnnotatorTransformer):
diff --git a/python/test/annotators.py b/python/test/annotators.py
index df9b5f88d30603..e42745ffb7438b 100644
--- a/python/test/annotators.py
+++ b/python/test/annotators.py
@@ -76,6 +76,26 @@ def runTest(self):
lemmatizer.transform(tokenized).show()
+class NormalizerTestSpec(unittest.TestCase):
+
+ def setUp(self):
+ self.data = SparkContextForTest.data
+
+ def runTest(self):
+ document_assembler = DocumentAssembler() \
+ .setInputCol("text") \
+ .setOutputCol("document")
+ tokenizer = RegexTokenizer() \
+ .setOutputCol("token")
+ normalizer = Normalizer() \
+ .setInputCols(["token"]) \
+ .setOutputCol("normalized_token") \
+ .setLowercase(False)
+ assembled = document_assembler.transform(self.data)
+ tokenized = tokenizer.transform(assembled)
+ normalizer.transform(tokenized).show()
+
+
class DateMatcherTestSpec(unittest.TestCase):
def setUp(self):
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
index 0893de13e15745..ea9b743406f0f8 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
@@ -1,7 +1,7 @@
package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
-import org.apache.spark.ml.param.Param
+import org.apache.spark.ml.param.{BooleanParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
/**
@@ -17,20 +17,29 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)
val pattern = new Param[String](this, "pattern", "normalization regex pattern which match will be replaced with a space")
+ val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")
setDefault(pattern, "[^a-zA-Z]")
+ setDefault(lowercase, true)
def getPattern: String = $(pattern)
def setPattern(value: String): this.type = set(pattern, value)
+ def getLowercase: Boolean = $(lowercase)
+
+ def setLowercase(value: Boolean): this.type = set(lowercase, value)
+
def this() = this(Identifiable.randomUID("NORMALIZER"))
/** ToDo: Review implementation, Current implementation generates spaces between non-words, potentially breaking tokens*/
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
annotations.map { token =>
- val nToken = token.result
- .toLowerCase
+ val cased =
+ if ($(lowercase)) token.result.toLowerCase
+ else token.result
+
+ val nToken = cased
.replaceAll($(pattern), "")
.trim
Annotation(
diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
index 18a5c7a61e9a80..0e67f45913aa83 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
@@ -46,6 +46,14 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
normalizer.transform(withTokenizer(dataset))
}
+ def withCaseSensitiveNormalizer(dataset: Dataset[Row]): Dataset[Row] = {
+ val normalizer = new Normalizer()
+ .setInputCols(Array("token"))
+ .setOutputCol("normalized")
+ .setLowercase(false)
+ normalizer.transform(withTokenizer(dataset))
+ }
+
def withFullLemmatizer(dataset: Dataset[Row]): Dataset[Row] = {
val lemmatizer = new Lemmatizer()
.setInputCols(Array("token"))
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
index 01d8485e5474b6..0572a63009172e 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
@@ -11,7 +11,7 @@ trait NormalizerBehaviors { this: FlatSpec =>
AnnotatorBuilder.withFullNormalizer(dataset)
.collect().foreach {
row =>
- row.getSeq[Row](3)
+ row.getSeq[Row](4)
.map(Annotation(_))
.foreach {
case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
@@ -21,4 +21,25 @@ trait NormalizerBehaviors { this: FlatSpec =>
}
}
}
+
+ def lowercasingNormalizerPipeline(dataset: => Dataset[Row]) {
+ "A case-sensitive Normalizer Annotator" should "successfully transform data" in {
+ AnnotatorBuilder.withCaseSensitiveNormalizer(dataset)
+ .collect().foreach {
+ row =>
+ val tokens = row.getSeq[Row](3).map(Annotation(_))
+ val normalizedAnnotations = row.getSeq[Row](4).map(Annotation(_))
+ normalizedAnnotations.foreach {
+ case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
+ assert(stem.result.nonEmpty, "Annotation result exists")
+ case _ =>
+ }
+
+ normalizedAnnotations.zip(tokens).foreach {
+ case (stem: Annotation, token: Annotation) =>
+ assert(stem.result == token.result.replaceAll("[^a-zA-Z]", ""))
+ }
+ }
+ }
+ }
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala
index 6e5b4e025ec988..98307a4880b4cb 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala
@@ -17,5 +17,5 @@ class NormalizerTestSpec extends FlatSpec with NormalizerBehaviors {
val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)
"A full Normalizer pipeline with latin content" should behave like fullNormalizerPipeline(latinBodyData)
-
+ "A Normalizer pipeline with latin content and disabled lowercasing" should behave like lowercasingNormalizerPipeline(latinBodyData)
}