1 change: 1 addition & 0 deletions docs/components.html
@@ -243,6 +243,7 @@ <h4 id="Normalizer" class="section-block">3. Normalizer: Text cleaning</h4>
<ul>
<li>
setPattern(pattern): Regular expression for normalization, defaults [^A-Za-z]
setLowercase(value): lowercase tokens, default true
</li>
</ul>
<b>Example:</b><br>
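The docs' own usage example is cut off by the collapsed diff context here. As a rough Scala sketch only — the setters are the ones documented above, but the column names and pipeline wiring are assumed rather than taken from the docs example:

import com.johnsnowlabs.nlp.annotators.Normalizer

// Keep original casing while still stripping non-letter characters.
val normalizer = new Normalizer()
  .setInputCols(Array("token"))      // tokens produced by an upstream tokenizer
  .setOutputCol("normalized")
  .setPattern("[^A-Za-z]")           // default pattern; matching characters are removed
  .setLowercase(false)               // new parameter added in this PR

With setLowercase(true) (the default) the annotator behaves as before, lowercasing every token.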
6 changes: 6 additions & 0 deletions python/sparknlp/annotator.py
@@ -115,6 +115,10 @@ class Normalizer(AnnotatorTransformer):
"normalization regex pattern which match will be replaced with a space",
typeConverter=TypeConverters.toString)

lowercase = Param(Params._dummy(),
"lowercase",
"whether to convert strings to lowercase")

@keyword_only
def __init__(self):
super(Normalizer, self).__init__()
@@ -123,6 +127,8 @@ def __init__(self):
def setPattern(self, value):
return self._set(pattern=value)

def setLowercase(self, value):
return self._set(lowercase=value)

class RegexMatcher(AnnotatorTransformer):

20 changes: 20 additions & 0 deletions python/test/annotators.py
@@ -76,6 +76,26 @@ def runTest(self):
lemmatizer.transform(tokenized).show()


class NormalizerTestSpec(unittest.TestCase):

def setUp(self):
self.data = SparkContextForTest.data

def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
tokenizer = RegexTokenizer() \
.setOutputCol("token")
normalizer = Normalizer() \
.setInputCols(["token"]) \
.setOutputCol("normalized_token") \
.setLowercase(False)
assembled = document_assembler.transform(self.data)
tokenized = tokenizer.transform(assembled)
normalizer.transform(tokenized).show()


class DateMatcherTestSpec(unittest.TestCase):

def setUp(self):
15 changes: 12 additions & 3 deletions src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
@@ -1,7 +1,7 @@
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.param.{BooleanParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

/**
@@ -17,20 +17,29 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

val pattern = new Param[String](this, "pattern", "normalization regex pattern which match will be replaced with a space")
val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")

setDefault(pattern, "[^a-zA-Z]")
setDefault(lowercase, true)

def getPattern: String = $(pattern)

def setPattern(value: String): this.type = set(pattern, value)

def getLowercase: Boolean = $(lowercase)

def setLowercase(value: Boolean): this.type = set(lowercase, value)

def this() = this(Identifiable.randomUID("NORMALIZER"))

/** ToDo: Review implementation, Current implementation generates spaces between non-words, potentially breaking tokens*/
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
annotations.map { token =>
val nToken = token.result
.toLowerCase
val cased =
if ($(lowercase)) token.result.toLowerCase
else token.result

val nToken = cased
.replaceAll($(pattern), "")
.trim
Annotation(
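The only change to annotate is that lowercasing now happens behind the new flag, before the existing pattern replacement and trim. A standalone sketch (not part of the diff) of the resulting per-token behavior, using the default pattern set in this file:

// Rough stand-in for the annotate step above, with the default pattern "[^a-zA-Z]".
def normalizeToken(token: String, lowercase: Boolean): String = {
  val cased = if (lowercase) token.toLowerCase else token
  cased.replaceAll("[^a-zA-Z]", "").trim
}

normalizeToken("Hello-World!", lowercase = true)   // "helloworld"
normalizeToken("Hello-World!", lowercase = false)  // "HelloWorld"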
8 changes: 8 additions & 0 deletions src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
@@ -46,6 +46,14 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
normalizer.transform(withTokenizer(dataset))
}

def withCaseSensitiveNormalizer(dataset: Dataset[Row]): Dataset[Row] = {
val normalizer = new Normalizer()
.setInputCols(Array("token"))
.setOutputCol("normalized")
.setLowercase(false)
normalizer.transform(withTokenizer(dataset))
}

def withFullLemmatizer(dataset: Dataset[Row]): Dataset[Row] = {
val lemmatizer = new Lemmatizer()
.setInputCols(Array("token"))
@@ -11,7 +11,7 @@ trait NormalizerBehaviors { this: FlatSpec =>
AnnotatorBuilder.withFullNormalizer(dataset)
.collect().foreach {
row =>
row.getSeq[Row](3)
row.getSeq[Row](4)
.map(Annotation(_))
.foreach {
case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
@@ -21,4 +21,25 @@ }
}
}
}

def lowercasingNormalizerPipeline(dataset: => Dataset[Row]) {
"A case-sensitive Normalizer Annotator" should "successfully transform data" in {
AnnotatorBuilder.withCaseSensitiveNormalizer(dataset)
.collect().foreach {
row =>
val tokens = row.getSeq[Row](3).map(Annotation(_))
val normalizedAnnotations = row.getSeq[Row](4).map(Annotation(_))
normalizedAnnotations.foreach {
case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
assert(stem.result.nonEmpty, "Annotation result exists")
case _ =>
}

normalizedAnnotations.zip(tokens).foreach {
case (stem: Annotation, token: Annotation) =>
assert(stem.result == token.result.replaceAll("[^a-zA-Z]", ""))
}
}
}
}
}
@@ -17,5 +17,5 @@ class NormalizerTestSpec extends FlatSpec with NormalizerBehaviors {
val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)

"A full Normalizer pipeline with latin content" should behave like fullNormalizerPipeline(latinBodyData)

"A Normalizer pipeline with latin content and disabled lowercasing" should behave like lowercasingNormalizerPipeline(latinBodyData)
}