From a43366e96e296a28cc5aad20834cfc04d351ecbf Mon Sep 17 00:00:00 2001
From: lambdaofgod
Date: Thu, 11 Jan 2018 21:36:59 +0100
Subject: [PATCH 1/3] make lowercasing in Normalizer optional; add Scala tests

---
 .../nlp/annotators/Normalizer.scala           | 15 +++++++++---
 .../johnsnowlabs/nlp/AnnotatorBuilder.scala   |  8 +++++++
 .../nlp/annotators/NormalizerBehaviors.scala  | 23 ++++++++++++++++++-
 .../nlp/annotators/NormalizerTestSpec.scala   |  2 +-
 4 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
index 0893de13e15745..ea9b743406f0f8 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
@@ -1,7 +1,7 @@
 package com.johnsnowlabs.nlp.annotators
 
 import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
-import org.apache.spark.ml.param.Param
+import org.apache.spark.ml.param.{BooleanParam, Param}
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 
 /**
@@ -17,20 +17,29 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
   override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)
 
   val pattern = new Param[String](this, "pattern", "normalization regex pattern which match will be replaced with a space")
+  val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")
 
   setDefault(pattern, "[^a-zA-Z]")
+  setDefault(lowercase, true)
 
   def getPattern: String = $(pattern)
 
   def setPattern(value: String): this.type = set(pattern, value)
 
+  def getLowercase: Boolean = $(lowercase)
+
+  def setLowercase(value: Boolean): this.type = set(lowercase, value)
+
   def this() = this(Identifiable.randomUID("NORMALIZER"))
 
   /** ToDo: Review implementation, Current implementation generates spaces between non-words, potentially breaking tokens*/
   override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
     annotations.map { token =>
-      val nToken = token.result
-        .toLowerCase
+      val cased =
+        if ($(lowercase)) token.result.toLowerCase
+        else token.result
+
+      val nToken = cased
         .replaceAll($(pattern), "")
         .trim
       Annotation(
diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
index 18a5c7a61e9a80..0e67f45913aa83 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
@@ -46,6 +46,14 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
     normalizer.transform(withTokenizer(dataset))
   }
 
+  def withCaseSensitiveNormalizer(dataset: Dataset[Row]): Dataset[Row] = {
+    val normalizer = new Normalizer()
+      .setInputCols(Array("token"))
+      .setOutputCol("normalized")
+      .setLowercase(false)
+    normalizer.transform(withTokenizer(dataset))
+  }
+
   def withFullLemmatizer(dataset: Dataset[Row]): Dataset[Row] = {
     val lemmatizer = new Lemmatizer()
       .setInputCols(Array("token"))
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
index 01d8485e5474b6..0572a63009172e 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
@@ -11,7 +11,7 @@ trait NormalizerBehaviors { this: FlatSpec =>
     AnnotatorBuilder.withFullNormalizer(dataset)
      .collect().foreach {
      row =>
-       row.getSeq[Row](3)
+       row.getSeq[Row](4)
         .map(Annotation(_))
         .foreach {
           case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
@@ -21,4 +21,25 @@ trait NormalizerBehaviors { this: FlatSpec =>
         }
     }
   }
+
+  def lowercasingNormalizerPipeline(dataset: => Dataset[Row]) {
+    "A case-sensitive Normalizer Annotator" should "successfully transform data" in {
+      AnnotatorBuilder.withCaseSensitiveNormalizer(dataset)
+        .collect().foreach {
+        row =>
+          val tokens = row.getSeq[Row](3).map(Annotation(_))
+          val normalizedAnnotations = row.getSeq[Row](4).map(Annotation(_))
+          normalizedAnnotations.foreach {
+            case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
+              assert(stem.result.nonEmpty, "Annotation result exists")
+            case _ =>
+          }
+
+          normalizedAnnotations.zip(tokens).foreach {
+            case (stem: Annotation, token: Annotation) =>
+              assert(stem.result == token.result.replaceAll("[^a-zA-Z]", ""))
+          }
+      }
+    }
+  }
 }
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala
index 6e5b4e025ec988..98307a4880b4cb 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala
@@ -17,5 +17,5 @@ class NormalizerTestSpec extends FlatSpec with NormalizerBehaviors {
   val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)
 
   "A full Normalizer pipeline with latin content" should behave like fullNormalizerPipeline(latinBodyData)
-
+  "A Normalizer pipeline with latin content and disabled lowercasing" should behave like lowercasingNormalizerPipeline(latinBodyData)
 }

From 969adb69e4a24a61f2243c8022ba0da96e048f23 Mon Sep 17 00:00:00 2001
From: lambdaofgod
Date: Thu, 11 Jan 2018 23:35:21 +0100
Subject: [PATCH 2/3] changed docs

---
 docs/components.html | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/components.html b/docs/components.html
index f6922074a17ce3..d191942d012412 100644
--- a/docs/components.html
+++ b/docs/components.html
@@ -243,6 +243,7 @@
 [docs/components.html hunk body: the HTML markup was lost in extraction; the surviving fragments are the "3. Normalizer: Text cleaning" heading and its "Example:" label, and the single inserted line itself is not recoverable]
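
A quick usage sketch of what the two patches above enable, before the Python
bindings below. This is illustrative and not part of the patch series: the
column names and the `tokenized` DataFrame are assumptions mirroring the test
code in AnnotatorBuilder.scala.

    import com.johnsnowlabs.nlp.annotators.Normalizer

    // Case-preserving normalization: strip characters matching the default
    // pattern "[^a-zA-Z]" while leaving the original casing intact.
    // `tokenized` is assumed to carry a "token" annotation column, e.g. the
    // output of a tokenizer stage.
    val caseSensitiveNormalizer = new Normalizer()
      .setInputCols(Array("token"))
      .setOutputCol("normalized")
      .setLowercase(false)

    val normalized = caseSensitiveNormalizer.transform(tokenized)

Leaving `lowercase` at its default of true reproduces the previous behaviour,
so existing pipelines are unaffected.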
From fa98d36f64cd145d4c32627c6de335685fb258c6 Mon Sep 17 00:00:00 2001
From: lambdaofgod
Date: Fri, 12 Jan 2018 21:29:48 +0100
Subject: [PATCH 3/3] Added parameter in Python; NormalizerTestSpec

---
 python/sparknlp/annotator.py |  6 ++++++
 python/test/annotators.py    | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
index c4421c004af396..1548d83b0499db 100755
--- a/python/sparknlp/annotator.py
+++ b/python/sparknlp/annotator.py
@@ -115,6 +115,10 @@ class Normalizer(AnnotatorTransformer):
                     "normalization regex pattern which match will be replaced with a space",
                     typeConverter=TypeConverters.toString)
 
+    lowercase = Param(Params._dummy(),
+                      "lowercase",
+                      "whether to convert strings to lowercase")
+
     @keyword_only
     def __init__(self):
         super(Normalizer, self).__init__()
@@ -123,6 +127,8 @@ def __init__(self):
     def setPattern(self, value):
         return self._set(pattern=value)
 
+    def setLowercase(self, value):
+        return self._set(lowercase=value)
 
 
 class RegexMatcher(AnnotatorTransformer):
diff --git a/python/test/annotators.py b/python/test/annotators.py
index df9b5f88d30603..e42745ffb7438b 100644
--- a/python/test/annotators.py
+++ b/python/test/annotators.py
@@ -76,6 +76,26 @@ def runTest(self):
         lemmatizer.transform(tokenized).show()
 
 
+class NormalizerTestSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.data = SparkContextForTest.data
+
+    def runTest(self):
+        document_assembler = DocumentAssembler() \
+            .setInputCol("text") \
+            .setOutputCol("document")
+        tokenizer = RegexTokenizer() \
+            .setOutputCol("token")
+        normalizer = Normalizer() \
+            .setInputCols(["token"]) \
+            .setOutputCol("normalized_token") \
+            .setLowercase(False)
+        assembled = document_assembler.transform(self.data)
+        tokenized = tokenizer.transform(assembled)
+        normalizer.transform(tokenized).show()
+
+
 class DateMatcherTestSpec(unittest.TestCase):
 
     def setUp(self):
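
For reference, a hand-trace of what the new flag changes inside
Normalizer.annotate, using the default pattern and a made-up token
(illustrative only, Scala REPL style):

    val result = "Hello-World42"

    // lowercase = true (the default, i.e. the previous behaviour):
    result.toLowerCase.replaceAll("[^a-zA-Z]", "").trim   // "helloworld"

    // lowercase = false (the new option): non-letters are still stripped,
    // but the original casing survives.
    result.replaceAll("[^a-zA-Z]", "").trim               // "HelloWorld"

The Python setLowercase(False) added in patch 3 toggles the same underlying
parameter through the pyspark Params machinery.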