From db89cbb91c6de1cd6842dc82e79bbe405ff8c937 Mon Sep 17 00:00:00 2001
From: Saif Addin
1. DocumentAssembler: Getting
- 2. RegexTokenizer: Word tokens
+ 2. Tokenizer: Word tokens
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setInputCols(["sentences"]) \
.setOutputCol("token")
@@ -218,7 +218,7 @@ 2. RegexTokenizer: Word tokens
Example:
- val regexTokenizer = new RegexTokenizer()
+ val regexTokenizer = new Tokenizer()
.setInputCols("sentence")
.setOutputCol("token")
@@ -653,7 +653,7 @@ 9. SentenceDetector: Sentence B
Example:
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence") \
.setUseAbbreviations(True)
@@ -673,7 +673,7 @@ 9. SentenceDetector: Sentence B
Example:
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setInputCols("document")
.setOutputCol("sentence")
@@ -790,7 +790,7 @@ 11. SentimentDetector: Sentime
Example:
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment")
@@ -825,7 +825,7 @@ 11. SentimentDetector: Sentime
Example:
- val sentimentDetector = new SentimentDetectorModel
+ val sentimentDetector = new SentimentDetector
.setInputCols(Array("token", "sentence"))
.setOutputCol("sentiment")
@@ -902,7 +902,7 @@ 13. SpellChecker: Token spell
Inputs: Any text for corpus. A list of words for dictionary. A
comma
separated custom dictionary.
- Requires: RegexTokenizer
+ Requires: Tokenizer
Functions:
-
@@ -947,7 +947,7 @@ 13. SpellChecker: Token spell
Inputs: Any text for corpus. A list of words for dictionary. A
comma
separated custom dictionary.
- Requires: RegexTokenizer
+ Requires: Tokenizer
Functions:
-
@@ -1017,7 +1017,7 @@ 14. ViveknSentimentDetec
Input: File or folder of text files of positive and negative data
Example:
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment")
@@ -1225,7 +1225,7 @@ 16. TokenAssembler: Getting data
Annotators
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
-val sentenceDetector = new SentenceDetectorModel()
+ import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
+val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")
-val regexTokenizer = new RegexTokenizer()
+val regexTokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
diff --git a/python/example/crf-ner/ner.ipynb b/python/example/crf-ner/ner.ipynb
index a29bbbbb5b301c..7bf620236fa9ff 100644
--- a/python/example/crf-ner/ner.ipynb
+++ b/python/example/crf-ner/ner.ipynb
@@ -101,11 +101,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- "sentenceDetector = SentenceDetectorModel()\\\n",
+ "sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer()\\\n",
+ "tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/crf-ner/ner_benchmark.ipynb b/python/example/crf-ner/ner_benchmark.ipynb
index b63c636ebe4833..4877ef0db82e5f 100644
--- a/python/example/crf-ner/ner_benchmark.ipynb
+++ b/python/example/crf-ner/ner_benchmark.ipynb
@@ -182,11 +182,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- " sentenceDetector = SentenceDetectorModel()\\\n",
+ " sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- " tokenizer = RegexTokenizer()\\\n",
+ " tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/dictionary-sentiment/sentiment.ipynb b/python/example/dictionary-sentiment/sentiment.ipynb
index 81264b96d533b6..0726efe0ae50f3 100644
--- a/python/example/dictionary-sentiment/sentiment.ipynb
+++ b/python/example/dictionary-sentiment/sentiment.ipynb
@@ -45,11 +45,11 @@
"document_assembler = DocumentAssembler() \\\n",
" .setInputCol(\"text\")\n",
"\n",
- "sentence_detector = SentenceDetectorModel() \\\n",
+ "sentence_detector = SentenceDetector() \\\n",
" .setInputCols([\"document\"]) \\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer() \\\n",
+ "tokenizer = Tokenizer() \\\n",
" .setInputCols([\"sentence\"]) \\\n",
" .setOutputCol(\"token\")\n",
"\n",
@@ -58,7 +58,7 @@
" .setOutputCol(\"lemma\") \\\n",
" .setDictionary(\"../../../src/test/resources/lemma-corpus/AntBNC_lemmas_ver_001.txt\")\n",
" \n",
- "sentiment_detector = SentimentDetectorModel() \\\n",
+ "sentiment_detector = SentimentDetector() \\\n",
" .setInputCols([\"lemma\", \"sentence\"]) \\\n",
" .setOutputCol(\"sentiment_score\") \\\n",
" .setDictPath(\"../../../src/test/resources/sentiment-corpus/default-sentiment-dict.txt\")\n",
diff --git a/python/example/entities-extractor/extractor.ipynb b/python/example/entities-extractor/extractor.ipynb
index 625d4523df7f41..a0d692f3bd2c1f 100644
--- a/python/example/entities-extractor/extractor.ipynb
+++ b/python/example/entities-extractor/extractor.ipynb
@@ -51,11 +51,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- "sentenceDetector = SentenceDetectorModel()\\\n",
+ "sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer()\\\n",
+ "tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/vivekn-sentiment/sentiment.ipynb b/python/example/vivekn-sentiment/sentiment.ipynb
index f453217ad1e14c..bf11f116846402 100644
--- a/python/example/vivekn-sentiment/sentiment.ipynb
+++ b/python/example/vivekn-sentiment/sentiment.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"collapsed": true
},
@@ -20,9 +20,42 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------+---------+--------------------+\n",
+ "|itemid|sentiment| text|\n",
+ "+------+---------+--------------------+\n",
+ "| 1| 0| ...|\n",
+ "| 2| 0| ...|\n",
+ "| 3| 1| omg...|\n",
+ "| 4| 0| .. Omga...|\n",
+ "| 5| 0| i think ...|\n",
+ "| 6| 0| or i jus...|\n",
+ "| 7| 1| Juuuuuuuuu...|\n",
+ "| 8| 0| Sunny Agai...|\n",
+ "| 9| 1| handed in m...|\n",
+ "| 10| 1| hmmmm.... i...|\n",
+ "| 11| 0| I must thin...|\n",
+ "| 12| 1| thanks to a...|\n",
+ "| 13| 0| this weeken...|\n",
+ "| 14| 0| jb isnt show...|\n",
+ "| 15| 0| ok thats it ...|\n",
+ "| 16| 0| <-------- ...|\n",
+ "| 17| 0| awhhe man.......|\n",
+ "| 18| 1| Feeling stran...|\n",
+ "| 19| 0| HUGE roll of ...|\n",
+ "| 20| 0| I just cut my...|\n",
+ "+------+---------+--------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"#Load the input data to be annotated\n",
"data = spark. \\\n",
@@ -36,7 +69,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"collapsed": true
},
@@ -59,7 +92,7 @@
"outputs": [],
"source": [
"### Sentence detector\n",
- "sentence_detector = SentenceDetectorModel() \\\n",
+ "sentence_detector = SentenceDetector() \\\n",
" .setInputCols([\"document\"]) \\\n",
" .setOutputCol(\"sentence\")\n",
"#sentence_data = sentence_detector.transform(checked)"
@@ -74,7 +107,7 @@
"outputs": [],
"source": [
"### Tokenizer\n",
- "tokenizer = RegexTokenizer() \\\n",
+ "tokenizer = Tokenizer() \\\n",
" .setInputCols([\"sentence\"]) \\\n",
" .setOutputCol(\"token\")\n",
"#tokenized = tokenizer.transform(assembled)"
@@ -154,7 +187,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"pipeline = Pipeline(stages=[\n",
@@ -178,7 +213,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"for r in sentiment_data.take(5):\n",
@@ -188,7 +225,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
@@ -201,7 +240,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
@@ -214,7 +255,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
index 63e8cf208260d1..542e2d6a52f63c 100755
--- a/python/sparknlp/annotator.py
+++ b/python/sparknlp/annotator.py
@@ -333,7 +333,7 @@ def __init__(self):
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector", self.uid)
-class SentimentDetectorModel(AnnotatorTransformer):
+class SentimentDetector(AnnotatorTransformer):
dictPath = Param(Params._dummy(),
"dictPath",
"path for dictionary to sentiment analysis")
@@ -348,8 +348,8 @@ class SentimentDetectorModel(AnnotatorTransformer):
@keyword_only
def __init__(self):
- super(SentimentDetectorModel, self).__init__()
- self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel", self.uid)
+ super(SentimentDetector, self).__init__()
+ self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector", self.uid)
def setDictPath(self, value):
return self._set(dictPath=value)
diff --git a/python/test/annotators.py b/python/test/annotators.py
index e42745ffb7438b..b0ec7d74d1b193 100644
--- a/python/test/annotators.py
+++ b/python/test/annotators.py
@@ -16,7 +16,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer()\
+ tokenizer = Tokenizer()\
.setOutputCol("token")
stemmer = Stemmer() \
.setInputCols(["token"]) \
@@ -65,7 +65,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
@@ -85,7 +85,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Normalizer() \
.setInputCols(["token"]) \
@@ -121,7 +121,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
entity_extractor = EntityExtractor() \
.setOutputCol("entity")
@@ -139,10 +139,10 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setInputCols(["sentence"]) \
.setOutputCol("token")
pos_tagger = PerceptronApproach() \
@@ -166,7 +166,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence") \
.setCustomBounds(["%%"])
@@ -183,17 +183,17 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setInputCols(["sentence"]) \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
.setOutputCol("lemma") \
.setDictionary({"missed": "miss"})
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment") \
.setDictPath("../src/test/resources/sentiment-corpus/default-sentiment-dict.txt")
@@ -213,7 +213,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
@@ -248,7 +248,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
spell_checker = NorvigSweetingApproach() \
.setInputCols(["token"]) \
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index 610dd43d48ecb4..f08bdc2ab5f75f 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -24,7 +24,7 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
override val annotatorType: AnnotatorType = TOKEN
- /** A RegexTokenizer could require only for now a SentenceDetectorModel annotator */
+ /** A Tokenizer could require only for now a SentenceDetector annotator */
override val requiredAnnotatorTypes: Array[AnnotatorType] = Array[AnnotatorType](DOCUMENT)
def this() = this(Identifiable.randomUID("REGEX_TOKENIZER"))
@@ -56,7 +56,7 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
setDefault(wordPattern, "\\w+")
setDefault(extensionPattern, Array("\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*"))
setDefault(prefixPattern, Array("([^\\s\\w]?)"))
- setDefault(suffixPattern, Array("([^\\s\\w]?)"))
+ setDefault(suffixPattern, Array("([^\\s\\w]?)([^\\s\\w]*)"))
val ruleFactory = new RuleFactory(MatchStrategy.MATCH_ALL)
@@ -64,7 +64,8 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
/** Clears out rules and constructs a new rule for every combination of rules provided */
/** The strategy is to catch one token per regex group */
/** User may add its own groups if needs targets to be tokenized separately from the rest */
- /** "([^\\s\\w]?)(\\w+(?:\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*)?)([^\\s\\w]?)" */
+ /** "([^\s\w]?)(\w+(?:\.(?:\w{1}\.)+|(?:\-\w+)*)?)([^\s\w]?)([\s\w]*)" */
+ /** */
ruleFactory
.clearRules()
$(prefixPattern).foreach(pp => $(suffixPattern).foreach (sp => $(extensionPattern).foreach(ep => {
@@ -81,7 +82,6 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
(1 to m.content.groupCount)
.map (i => IndexedToken(m.content.group(i), text.begin + m.content.start, text.begin + m.content.end - 1))
}.filter(t => t.token.nonEmpty).toArray
- tokens.foreach(t => println(t.token))
TokenizedSentence(tokens)
}
}
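For context (not part of the patch): with the defaults in the hunk above, every prefix/suffix/extension combination is concatenated around wordPattern into a single rule, and each capturing group of a match becomes a candidate token. A minimal plain-Scala sketch of that combination follows; the object name and sample text are illustrative only.

object CombinedPatternSketch extends App {
  // defaults from the hunk above, combined as: prefix + "(" + word + "(?:" + extension + ")?" + ")" + suffix
  val prefix    = "([^\\s\\w]?)"
  val word      = "\\w+"
  val extension = "\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*"
  val suffix    = "([^\\s\\w]?)([^\\s\\w]*)"   // new default: extra trailing group catches leftover punctuation
  val rule      = (prefix + "(" + word + "(?:" + extension + ")?" + ")" + suffix).r

  // every capturing group becomes a candidate token; empty groups are dropped
  val tokens = rule.findAllMatchIn("(U.S.A.).")
    .flatMap(m => (1 to m.groupCount).map(m.group))
    .filter(g => g != null && g.nonEmpty)
    .toList
  println(tokens)   // List((, U.S.A., ), .)
}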
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
similarity index 80%
rename from src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala
rename to src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
index 4915ffa57e018e..bdb47d64f953a3 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
@@ -7,10 +7,10 @@ import org.scalatest._
import scala.language.reflectiveCalls
-trait RegexTokenizerBehaviors { this: FlatSpec =>
+trait TokenizerBehaviors { this: FlatSpec =>
def fixture(dataset: => Dataset[Row]) = new {
- val df = AnnotatorBuilder.withTokenizer(AnnotatorBuilder.withFullPragmaticSentenceDetector(dataset))
+ val df = AnnotatorBuilder.withTokenizer(AnnotatorBuilder.withTokenizer(dataset))
val documents = df.select("document")
val sentences = df.select("sentence")
val tokens = df.select("token")
@@ -34,16 +34,16 @@ trait RegexTokenizerBehaviors { this: FlatSpec =>
}
def fullTokenizerPipeline(dataset: => Dataset[Row]) {
- "A RegexTokenizer Annotator" should "successfully transform data" in {
+ "A Tokenizer Annotator" should "successfully transform data" in {
val f = fixture(dataset)
- assert(f.tokensAnnotations.nonEmpty, "RegexTokenizer should add annotators")
+ assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators")
}
it should "annotate using the annotatorType of token" in {
val f = fixture(dataset)
- assert(f.tokensAnnotations.nonEmpty, "RegexTokenizer should add annotators")
+ assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators")
f.tokensAnnotations.foreach { a =>
- assert(a.annotatorType == AnnotatorType.TOKEN, "RegexTokenizer annotations type should be equal to 'token'")
+ assert(a.annotatorType == AnnotatorType.TOKEN, "Tokenizer annotations type should be equal to 'token'")
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
index 10b7786142e756..3f692ee79517d8 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -11,7 +11,7 @@ import org.apache.spark.ml.Pipeline
/**
* Created by saif on 02/05/17.
*/
-class TokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors {
+class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
val regexTokenizer = new Tokenizer
@@ -22,17 +22,42 @@ class TokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors {
"a Tokenizer" should "correctly tokenize target text on its defaults parameters" in {
val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 'He said', I'll defeat markus-crassus.")
import data.sparkSession.implicits._
- val tokenizer = new Tokenizer().setInputCols("text").setOutputCol("token")
- val sentence = new SentenceDetector().setInputCols("token").setOutputCol("sentence")
- val finisher = new Finisher().setInputCols("sentence")//.setOutputAsArray(true)
- val pipeline = new Pipeline().setStages(Array(tokenizer, sentence, finisher))
- pipeline.fit(data).transform(data).select("finished_sentence").show
- assert(pipeline.fit(data).transform(data).select("output").as[Array[String]]
- .collect
- .sameElements(Array(
- "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
- "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crasus", ".")
- ))
+ val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
+ val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token")
+ val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
+ val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))
+ val expected = Array(
+ "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
+ "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", "."
+ )
+ val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
+ .collect.flatten
+ assert(
+ result.sameElements(expected),
+ s"because result tokens differ: " +
+ s"\nresult was \n${result.mkString("|")} \nexpected is: \n${expected.mkString("|")}"
+ )
+ }
+
+ "a Tokenizer" should "correctly tokenize target sentences on its defaults parameters" in {
+ val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 'He said', I'll defeat markus-crassus.")
+ import data.sparkSession.implicits._
+ val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
+ val sentence = new SentenceDetector().setInputCols("document").setOutputCol("sentence")
+ val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token")
+ val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
+ val pipeline = new Pipeline().setStages(Array(document, sentence, tokenizer, finisher))
+ val expected = Array(
+ "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
+ "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", "."
+ )
+ val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
+ .collect.flatten
+ assert(
+ result.sameElements(expected),
+ s"because result tokens differ: " +
+ s"\nresult was \n${result.mkString("|")} \nexpected is: \n${expected.mkString("|")}"
+ )
}
"a spark based tokenizer" should "resolve big data" in {
@@ -55,6 +80,6 @@ class TokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors {
val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)
- "A full RegexTokenizer pipeline with latin content" should behave like fullTokenizerPipeline(latinBodyData)
+ "A full Tokenizer pipeline with latin content" should behave like fullTokenizerPipeline(latinBodyData)
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
index 8aca483f910e7a..b1f053161ab7a3 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
@@ -8,7 +8,7 @@ class SentenceDetectorBoundsSpec extends FlatSpec {
val model = new PragmaticMethod(false)
- "SentenceDetectorModel" should "return correct sentence bounds" in {
+ "SentenceDetector" should "return correct sentence bounds" in {
val bounds = model.extractBounds("Hello World!! New Sentence", Array.empty[String])
assert(bounds.length == 2)
@@ -16,7 +16,7 @@ class SentenceDetectorBoundsSpec extends FlatSpec {
assert(bounds(1) == Sentence("New Sentence", 14, 25))
}
- "SentenceDetectorModel" should "correct return sentence bounds with whitespaces" in {
+ "SentenceDetector" should "correct return sentence bounds with whitespaces" in {
val bounds = model.extractBounds(" Hello World!! . New Sentence ", Array.empty[String])
assert(bounds.length == 3)
From 5311bec2744e8ad535737436304e234553fbb080 Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 03:03:58 -0300
Subject: [PATCH 4/9] - New tokenizer wrap up
---
python/sparknlp/annotator.py | 42 ++++-
python/test/annotators.py | 6 +-
.../com/johnsnowlabs/nlp/Annotation.scala | 4 +-
.../com/johnsnowlabs/nlp/TokenAssembler.scala | 2 +-
.../nlp/annotators/Lemmatizer.scala | 2 +-
.../nlp/annotators/Normalizer.scala | 2 +-
.../johnsnowlabs/nlp/annotators/Stemmer.scala | 2 +-
.../nlp/annotators/Tokenizer.scala | 145 +++++++++++++-----
.../annotators/common/DependencyParsed.scala | 2 +-
.../nlp/annotators/common/SentenceSplit.scala | 6 +-
.../nlp/annotators/common/Tagged.scala | 8 +-
.../nlp/annotators/common/Tokenized.scala | 4 +-
.../pragmatic/PragmaticContentFormatter.scala | 2 +-
.../spell/norvig/NorvigSweetingModel.scala | 2 +-
.../nlp/util/io/ResourceHelper.scala | 2 +-
.../nlp/util/regex/RuleFactory.scala | 2 +-
.../nlp/AnnotatorBaseTestSpec.scala | 4 +-
.../nlp/DocumentAssemblerTestSpec.scala | 2 +-
.../nlp/annotators/NormalizerBehaviors.scala | 11 +-
.../nlp/annotators/TokenizerBehaviors.scala | 4 +-
.../nlp/annotators/TokenizerTestSpec.scala | 44 +++---
.../ner/crf/NerCrfApproachSpec.scala | 2 +-
.../dep/DependencyParserApproachTest.scala | 2 +-
.../GreedyTransitionApproachTest.scala | 6 +-
.../PerceptronApproachTestSpec.scala | 1 -
25 files changed, 208 insertions(+), 101 deletions(-)
diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
index 542e2d6a52f63c..8c1050f10f6989 100755
--- a/python/sparknlp/annotator.py
+++ b/python/sparknlp/annotator.py
@@ -85,18 +85,50 @@ def __init__(self):
class Tokenizer(AnnotatorTransformer):
- pattern = Param(Params._dummy(),
- "pattern",
- "regular expression pattern for tokenization",
+ targetPattern = Param(Params._dummy(),
+ "targetPattern",
+ "pattern to grab from text as token candidates. Defaults \S+",
typeConverter=TypeConverters.toString)
+ prefixPattern = Param(Params._dummy(),
+ "prefixPattern",
+ "regex with groups and begins with \A to match target prefix. Defaults to \A([^\s\w\$\.]*)",
+ typeConverter=TypeConverters.toString)
+
+ suffixPatern = Param(Params._dummy(),
+ "suffixPatern",
+ "regex with groups and ends with \z to match target suffix. Defaults to ([^\s\w]?)([^\s\w]*)\z",
+ typeConverter=TypeConverters.toString)
+
+ compositeTokens = Param(Params._dummy(),
+ "compositeTokens",
+ "Words that won't be split in two",
+ typeConverter=TypeConverters.toListString)
+
+ infixPatterns = Param(Params._dummy(),
+ "infixPatterns",
+ "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults",
+ typeConverter=TypeConverters.toListString)
+
@keyword_only
def __init__(self):
super(Tokenizer, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.Tokenizer", self.uid)
- def setPattern(self, value):
- return self._set(pattern=value)
+ def setTargetPattern(self, value):
+ return self._set(targetPattern=value)
+
+ def setPrefixPattern(self, value):
+ return self._set(prefixPattern=value)
+
+ def setSuffixPattern(self, value):
+ return self._set(suffixPattern=value)
+
+ def setCompositeTokens(self, value):
+ return self._set(compositeTokens=value)
+
+ def setInfixPatterns(self, value):
+ return self._set(infixPatterns=value)
class Stemmer(AnnotatorTransformer):
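For orientation (not part of the patch): the Python parameters above mirror the setters on the Scala Tokenizer reworked later in this patch. A hedged usage sketch against the Scala annotator, using the default values quoted in the parameter docs:

import com.johnsnowlabs.nlp.annotators.Tokenizer

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")
  .setTargetPattern("\\S+")                         // candidates are whitespace-separated chunks
  .setPrefixPattern("\\A([^\\s\\w\\$\\.]*)")        // must start with \A and contain regex groups
  .setSuffixPattern("([^\\s\\w]?)([^\\s\\w]*)\\z")  // must end with \z and contain regex groups
  .setCompositeTokens(Array("New York"))            // words protected from being split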
diff --git a/python/test/annotators.py b/python/test/annotators.py
index b0ec7d74d1b193..21051f72d5dd25 100644
--- a/python/test/annotators.py
+++ b/python/test/annotators.py
@@ -17,7 +17,8 @@ def runTest(self):
.setInputCol("text") \
.setOutputCol("document")
tokenizer = Tokenizer()\
- .setOutputCol("token")
+ .setOutputCol("token") \
+ .setCompositeTokens(["New York"])
stemmer = Stemmer() \
.setInputCols(["token"]) \
.setOutputCol("stem")
@@ -29,7 +30,8 @@ def runTest(self):
.setOutputCol("assembled")
finisher = Finisher() \
.setInputCols(["assembled"]) \
- .setOutputCols(["reassembled_view"])
+ .setOutputCols(["reassembled_view"]) \
+ .setCleanAnnotations(True)
assembled = document_assembler.transform(self.data)
tokenized = tokenizer.transform(assembled)
stemmed = stemmer.transform(tokenized)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
index d233cd09853292..f5b7a470762e8b 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
@@ -10,11 +10,11 @@ import scala.collection.Map
/**
* represents annotator's output parts and their details
* @param annotatorType the type of annotation
- * @param begin the index of the first character under this annotation
+ * @param start the index of the first character under this annotation
* @param end the index after the last character under this annotation
* @param metadata associated metadata for this annotation
*/
-case class Annotation(annotatorType: String, begin: Int, end: Int, result: String, metadata: Map[String, String])
+case class Annotation(annotatorType: String, start: Int, end: Int, result: String, metadata: Map[String, String])
object Annotation {
diff --git a/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
index a8f9a4ed949f0a..f6011b31ad477f 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
@@ -21,7 +21,7 @@ class TokenAssembler(override val uid: String) extends AnnotatorModel[TokenAssem
.map{case (_, sentenceAnnotations) =>
Annotation(
DOCUMENT,
- sentenceAnnotations.minBy(_.begin).begin,
+ sentenceAnnotations.minBy(_.start).start,
sentenceAnnotations.maxBy(_.end).end,
sentenceAnnotations.map(_.result).mkString(" "),
Map.empty[String, String]
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
index 0c1863b0d33fa6..1dd0739bfff007 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
@@ -78,7 +78,7 @@ class Lemmatizer(override val uid: String) extends AnnotatorModel[Lemmatizer] {
val token = tokenAnnotation.result
Annotation(
annotatorType,
- tokenAnnotation.begin,
+ tokenAnnotation.start,
tokenAnnotation.end,
$$(lemmaDict).getOrElse(token, token),
tokenAnnotation.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
index bdca08c85a32c8..ca83ae0f6c3d8c 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
@@ -44,7 +44,7 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
.trim
Annotation(
annotatorType,
- token.begin,
+ token.start,
token.end,
nToken,
token.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
index b70e82490e2775..833ca16c5cbb45 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
@@ -37,7 +37,7 @@ class Stemmer(override val uid: String) extends AnnotatorModel[Stemmer] {
val stem = EnglishStemmer.stem(tokenAnnotation.result)
Annotation(
annotatorType,
- tokenAnnotation.begin,
+ tokenAnnotation.start,
tokenAnnotation.end,
stem,
tokenAnnotation.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index f08bdc2ab5f75f..350a877ad0759a 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -3,10 +3,10 @@ package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.annotators.common._
import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}
import org.apache.spark.ml.param.{Param, StringArrayParam}
-import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType}
+import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
-import scala.util.matching.Regex
+import scala.collection.mutable.ArrayBuffer
/**
* Tokenizes raw text into word pieces, tokens.
@@ -17,10 +17,11 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
import com.johnsnowlabs.nlp.AnnotatorType._
- val wordPattern: Param[String] = new Param(this, "wordPattern", "this is the base word pattern. Defaults \\w+")
- val extensionPattern: StringArrayParam = new StringArrayParam(this, "infixPattern", "infix patterns allow for word exceptions that count as single token. E.g. U.S.A. Defaults ")
- val prefixPattern: StringArrayParam = new StringArrayParam(this, "prefixPattern", "this is the token pattern")
- val suffixPattern: StringArrayParam = new StringArrayParam(this, "suffixPattern", "this is the token pattern")
+ val compositeTokens: StringArrayParam = new StringArrayParam(this, "compositeTokens", "Words that won't be split in two")
+ val targetPattern: Param[String] = new Param(this, "targetPattern", "pattern to grab from text as token candidates. Defaults \\S+")
+ val infixPatterns: StringArrayParam = new StringArrayParam(this, "infixPattern", "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults")
+ val prefixPattern: Param[String] = new Param[String](this, "prefixPattern", "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\w\\$\\.]*)")
+ val suffixPattern: Param[String] = new Param[String](this, "suffixPattern", "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\w]?)([^\\s\\w]*)\\z")
override val annotatorType: AnnotatorType = TOKEN
@@ -29,59 +30,125 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
def this() = this(Identifiable.randomUID("REGEX_TOKENIZER"))
- def setWordPattern(value: String): this.type = set(wordPattern, value)
+ def setTargetPattern(value: String): this.type = set(targetPattern, value)
- def setExtensionPattern(value: Array[String]): this.type = set(extensionPattern, value)
+ def setExtensionPatterns(value: Array[String]): this.type = set(infixPatterns, value)
- def addExtensionPattern(value: String): this.type = set(extensionPattern, $(extensionPattern) :+ value)
+ def addInfixPattern(value: String): this.type = set(infixPatterns, value +: $(infixPatterns))
- def setPrefixPattern(value: Array[String]): this.type = set(prefixPattern, value)
+ def setPrefixPattern(value: String): this.type = set(prefixPattern, value)
- def addPrefixPattern(value: String): this.type = set(prefixPattern, $(prefixPattern) :+ value)
+ def setSuffixPattern(value: String): this.type = set(suffixPattern, value)
- def setSuffixPattern(value: Array[String]): this.type = set(suffixPattern, value)
+ def setCompositeTokens(value: Array[String]): this.type = set(compositeTokens, value)
- def addSuffixPattern(value: String): this.type = set(suffixPattern, $(suffixPattern) :+ value)
+ def getCompositeTokens: Array[String] = $(compositeTokens)
- def getWordPattern: String = $(wordPattern)
+ def getInfixPatterns: Array[String] = $(infixPatterns)
- def getInfixPattern: Array[String] = $(extensionPattern)
+ def getPrefixPattern: String = $(prefixPattern)
- def getPrefixPattern: Array[String] = $(prefixPattern)
+ def getSuffixPattern: String = $(suffixPattern)
- def getSuffixPattern: Array[String] = $(suffixPattern)
+ def getTargetPattern: String = $(targetPattern)
setDefault(inputCols, Array(DOCUMENT))
- setDefault(wordPattern, "\\w+")
- setDefault(extensionPattern, Array("\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*"))
- setDefault(prefixPattern, Array("([^\\s\\w]?)"))
- setDefault(suffixPattern, Array("([^\\s\\w]?)([^\\s\\w]*)"))
+ lazy private val ruleFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
- val ruleFactory = new RuleFactory(MatchStrategy.MATCH_ALL)
-
- override def beforeAnnotate(): Unit = {
- /** Clears out rules and constructs a new rule for every combination of rules provided */
- /** The strategy is to catch one token per regex group */
- /** User may add its own groups if needs targets to be tokenized separately from the rest */
- /** "([^\s\w]?)(\w+(?:\.(?:\w{1}\.)+|(?:\-\w+)*)?)([^\s\w]?)([\s\w]*)" */
- /** */
+ /** Clears out rules and constructs a new rule for every combination of rules provided */
+ /** The strategy is to catch one token per regex group */
+ /** User may add its own groups if needs targets to be tokenized separately from the rest */
+ protected def setFactoryRules(): Unit = {
ruleFactory
.clearRules()
- $(prefixPattern).foreach(pp => $(suffixPattern).foreach (sp => $(extensionPattern).foreach(ep => {
- ruleFactory.addRule(
- (pp + "(" + $(wordPattern) + "(?:" + ep + ")?" + ")" + sp).r,
- "tokenizer construction pattern"
- )
- })))
+ val rules = ArrayBuffer.empty[String]
+ require($(infixPatterns).nonEmpty)
+ require($(infixPatterns).forall(ip => ip.contains("(") && ip.contains(")")),
+ "infix patterns must use regex group. Notice each group will result in separate token")
+ $(infixPatterns).foreach(ip => {
+ val rule = new StringBuilder
+ get(prefixPattern).orElse(getDefault(prefixPattern)).foreach(pp => {
+ require(pp.startsWith("\\A"), "prefixPattern must begin with \\A to ensure it is the beginning of the string")
+ require(pp.contains("(") && pp.contains(")"), "prefixPattern must contain regex groups. Each group will return in separate token")
+ rule.append(pp)
+ })
+ rule.append(ip)
+ get(suffixPattern).orElse(getDefault(suffixPattern)).foreach(sp => {
+ require(sp.endsWith("\\z"), "suffixPattern must end with \\z to ensure it is the end of the string")
+ require(sp.contains("(") && sp.contains(")"), "suffixPattern must contain regex groups. Each group will return in separate token")
+ rule.append(sp)
+ })
+ rules.append(rule.toString)
+ })
+ rules.foreach(rule => ruleFactory.addRule(rule.r, rule))
}
+ /** Check here for explanation on this default pattern */
+ setDefault(infixPatterns, Array(
+ "((?:\\w+\\.)+)", // http://rubular.com/r/cRBtGuLlF6
+ "(\\w+)(n't\\b)", // http://rubular.com/r/coeYJFt8eM
+ "(\\w+)('{1}\\w+)", // http://rubular.com/r/N84PYwYjQp
+ "((?:\\w+[^\\s\\w]{1})+\\w+)", // http://rubular.com/r/wOvQcey9e3
+ "(\\w+)" // basic word token
+ ))
+ /** These catch everything before and after a word, as a separate token*/
+ setDefault(prefixPattern, "\\A([^\\s\\w\\$\\.]*)")
+ setDefault(suffixPattern, "([^\\s\\w]?)([^\\s\\w]*)\\z")
+ setDefault(targetPattern, "\\S+")
+
+ setFactoryRules()
+
+ override def beforeAnnotate(): Unit = {
+ setFactoryRules()
+ }
+
+ private val PROTECT_STR = "ↈ"
+
def tag(sentences: Seq[Sentence]): Seq[TokenizedSentence] = {
sentences.map{text =>
- val tokens = ruleFactory.findMatch(text.content).flatMap { m =>
- (1 to m.content.groupCount)
- .map (i => IndexedToken(m.content.group(i), text.begin + m.content.start, text.begin + m.content.end - 1))
- }.filter(t => t.token.nonEmpty).toArray
+ /** Step 1, protect exception words from being broken*/
+ var protected_text = text.content
+ if (get(compositeTokens).isDefined) {
+ $(compositeTokens).foreach(tokenException =>
+ protected_text = protected_text.replaceAll(
+ tokenException,
+ tokenException.replaceAll("[^(?:" + $(targetPattern) + ")]", PROTECT_STR)
+ )
+ )
+ }
+ /** Step 2, Return protected exception tokens back into text and move on*/
+ val tokens = $(targetPattern).r.findAllMatchIn(protected_text).flatMap { candidate =>
+ if (get(compositeTokens).isDefined && candidate.matched.contains(PROTECT_STR)) {
+ /** Put back character and move on */
+ Seq(IndexedToken(
+ text.content.slice(text.start + candidate.start, text.start + candidate.end),
+ text.start + candidate.start,
+ text.start + candidate.end - 1
+ ))
+ }
+ else {
+ /** Step 3, If no exception found, find candidates through the possible general rule patterns*/
+ ruleFactory.findMatchFirstOnly(candidate.matched).map {m =>
+ var curPos = m.content.start
+ (1 to m.content.groupCount)
+ .map (i => {
+ val target = m.content.group(i)
+ val it = IndexedToken(
+ target,
+ text.start + candidate.start + curPos,
+ text.start + candidate.start + curPos + target.length - 1
+ )
+ curPos += target.length
+ it
+ })
+ /** Step 4, If rules didn't match, return whatever candidate we have and leave it as is*/
+ }.getOrElse(Seq(IndexedToken(
+ candidate.matched,
+ text.start + candidate.start,
+ text.start + candidate.end - 1
+ )))
+ }}.toArray.filter(t => t.token.nonEmpty)
TokenizedSentence(tokens)
}
}
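A standalone sketch (plain Scala, no Spark NLP imports) of what the rewritten tag() does per candidate: each targetPattern (\S+) chunk is matched against \A + prefix groups + one infix pattern + suffix groups + \z, every capturing group that matched becomes a token, and if no rule matches the candidate is kept whole. Only the first infix default is used here, so the fallback branch fires more often than in the real annotator.

object NewTokenizerSketch extends App {
  val prefix = "\\A([^\\s\\w\\$\\.]*)"
  val infix  = "((?:\\w+\\.)+)"                 // first infix default, http://rubular.com/r/cRBtGuLlF6
  val suffix = "([^\\s\\w]?)([^\\s\\w]*)\\z"
  val rule   = (prefix + infix + suffix).r

  val candidates = Seq("U.S.A.", "(U.K.).", "hello")   // \S+ chunks of some input text
  candidates.foreach { cand =>
    rule.findFirstMatchIn(cand) match {
      case Some(m) =>
        val tokens = (1 to m.groupCount).map(m.group).filter(g => g != null && g.nonEmpty)
        println(s"$cand -> ${tokens.mkString("|")}")
      case None =>
        println(s"$cand -> $cand")              // step 4: no rule matched, keep candidate as-is
    }
  }
  // U.S.A. -> U.S.A.
  // (U.K.). -> (|U.K.|)|.
  // hello -> hello
}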
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
index 2df2d514070227..3beb73cb97e217 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
@@ -15,7 +15,7 @@ object DependencyParsed extends Annotated[DependencyParsedSentence]{
val sentences = Tokenized.unpack(annotations)
val depAnnotations = annotations
.filter(a => a.annotatorType == annotatorType)
- .sortBy(a => a.begin)
+ .sortBy(a => a.start)
var last = 0
sentences.map{sentence =>
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
index 0a2d16b5aaa7b9..badb8644ffb316 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
@@ -5,7 +5,7 @@ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}
/**
* structure representing a sentence and its boundaries
*/
-case class Sentence(content: String, begin: Int, end: Int)
+case class Sentence(content: String, start: Int, end: Int)
object Sentence {
def fromTexts(texts: String*): Seq[Sentence] = {
@@ -27,11 +27,11 @@ object SentenceSplit extends Annotated[Sentence] {
override def unpack(annotations: Seq[Annotation]): Seq[Sentence] = {
annotations.filter(_.annotatorType == annotatorType)
.map(annotation =>
- Sentence(annotation.result, annotation.begin, annotation.end)
+ Sentence(annotation.result, annotation.start, annotation.end)
)
}
override def pack(items: Seq[Sentence]): Seq[Annotation] = {
- items.map(item => Annotation(annotatorType, item.begin, item.end, item.content, Map.empty[String, String]))
+ items.map(item => Annotation(annotatorType, item.start, item.end, item.content, Map.empty[String, String]))
}
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
index b64c5fb33bfc81..629a8099197544 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
@@ -15,17 +15,17 @@ trait Tagged[T >: TaggedSentence <: TaggedSentence] extends Annotated[T] {
val tokenized = Tokenized.unpack(annotations)
val tagAnnotations = annotations
.filter(a => a.annotatorType == annotatorType)
- .sortBy(a => a.begin)
+ .sortBy(a => a.start)
.toIterator
var annotation: Option[Annotation] = None
tokenized.map { sentence =>
val tokens = sentence.indexedTokens.map { token =>
- while (tagAnnotations.hasNext && (annotation.isEmpty || annotation.get.begin < token.begin))
+ while (tagAnnotations.hasNext && (annotation.isEmpty || annotation.get.start < token.begin))
annotation = Some(tagAnnotations.next)
- val tag = if (annotation.isDefined && annotation.get.begin == token.begin)
+ val tag = if (annotation.isDefined && annotation.get.start == token.begin)
annotation.get.result
else
emptyTag
@@ -69,7 +69,7 @@ trait Tagged[T >: TaggedSentence <: TaggedSentence] extends Annotated[T] {
}
protected def getLabels(sentences: Seq[TaggedSentence], labelAnnotations: Seq[Annotation]): Seq[TextSentenceLabels] = {
- val position2Tag = labelAnnotations.map(a => (a.begin, a.end) -> a.result).toMap
+ val position2Tag = labelAnnotations.map(a => (a.start, a.end) -> a.result).toMap
sentences.map{sentence =>
val labels = sentence.indexedTaggedWords.map { w =>
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
index e02bec3e79e339..c1128c5be7c976 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
@@ -13,8 +13,8 @@ object Tokenized extends Annotated[TokenizedSentence] {
SentenceSplit.unpack(annotations).map(sentence => {
tokens.filter(token =>
- token.begin >= sentence.begin & token.end <= sentence.end
- ).map(token => IndexedToken(token.result, token.begin, token.end))
+ token.start >= sentence.start & token.end <= sentence.end
+ ).map(token => IndexedToken(token.result, token.start, token.end))
}).filter(_.nonEmpty).map(indexedTokens => TokenizedSentence(indexedTokens))
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
index ca0432788c411b..d78b133a18cd89 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
@@ -199,7 +199,7 @@ class PragmaticContentFormatter(text: String) {
val factory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
// http://rubular.com/r/G2opjedIm9
//special periods
- .addRule(new RegexRule("http://rubular.com/r/G2opjedIm9", "formatGeo"))
+ .addRule(new RegexRule("(?<=[a-zA-z]°)\\.(?=\\s*\\d+)", "formatGeo"))
wip = factory.transformWithSymbol(MULT_PERIOD, wip)
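Quick standalone check (not part of the patch) of what the restored formatGeo pattern matches: the period in markers like "N°. 1026" is protected so it is not taken as a sentence boundary. The replacement string below is only illustrative; the real formatter substitutes its own MULT_PERIOD symbol.

object GeoPeriodSketch extends App {
  val formatGeo = "(?<=[a-zA-z]°)\\.(?=\\s*\\d+)".r
  val text = "You can find it at N°. 1026.253.553."
  println(formatGeo.replaceAllIn(text, "_"))
  // You can find it at N°_ 1026.253.553.
}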
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
index e12e0899eedb64..2f3814196d63e7 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
@@ -237,7 +237,7 @@ class NorvigSweetingModel(override val uid: String) extends AnnotatorModel[Norvi
annotations.map { token =>
Annotation(
annotatorType,
- token.begin,
+ token.start,
token.end,
check(token.result),
token.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
index a3d4837011d899..26eb40187b135a 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
@@ -370,7 +370,7 @@ object ResourceHelper {
val tokenizer = new Tokenizer()
.setInputCols("document")
.setOutputCol("token")
- .setWordPattern(tokenPattern)
+ .setTargetPattern(tokenPattern)
val normalizer = new Normalizer()
.setInputCols("token")
.setOutputCol("normal")
diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
index f7a544d9b1d650..70c2ac5cafbcda 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
@@ -69,7 +69,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
case _ => throw new IllegalArgumentException("Invalid match strategy")
}
- private val transformWithSymbolFunc = (text: String, symbol: String) => transformStrategy match {
+ private val transformWithSymbolFunc = (symbol: String, text: String) => transformStrategy match {
case APPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
logger.debug("Matched: {} from: {} using rule {} with strategy {}",
() => m.matched,
diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
index 30fc4466f57010..03167277af7ba1 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
@@ -103,13 +103,13 @@ class AnnotatorBaseTestSpec extends FlatSpec {
val contentMeta = result.select("demand", "result").take(1).head.getSeq[Row](0)
val contentAnnotation = contentMeta.map(Annotation(_)).head
assert(contentAnnotation.annotatorType == dummyAnnotator.annotatorType)
- assert(contentAnnotation.begin == 0)
+ assert(contentAnnotation.start == 0)
assert(contentAnnotation.end == 25)
assert(contentAnnotation.metadata.contains("a") && contentAnnotation.metadata("a") == "b")
val demandContentMeta = result.select("demand", "result").take(1).head.getSeq[Row](1)
val demandContentAnnotation = demandContentMeta.map(Annotation(_)).head
assert(demandContentAnnotation.annotatorType == demandingDummyAnnotator.annotatorType)
- assert(demandContentAnnotation.begin == 11)
+ assert(demandContentAnnotation.start == 11)
assert(demandContentAnnotation.end == 18)
assert(demandContentAnnotation.metadata.contains("aa") && demandContentAnnotation.metadata("aa") == "bb")
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
index 448fae3b41de92..326fed4adc2ae1 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
@@ -18,7 +18,7 @@ class DocumentAssemblerTestSpec extends FlatSpec {
"A DocumentAssembler" should "annotate with the correct indexes" in {
val f = fixture
- f.text.head should equal (f.text(f.assembledDoc.head.begin))
+ f.text.head should equal (f.text(f.assembledDoc.head.start))
f.text.last should equal (f.text(f.assembledDoc.head.end))
}
}
\ No newline at end of file
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
index 0572a63009172e..57f7ec8d50024f 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
@@ -27,17 +27,16 @@ trait NormalizerBehaviors { this: FlatSpec =>
AnnotatorBuilder.withCaseSensitiveNormalizer(dataset)
.collect().foreach {
row =>
- val tokens = row.getSeq[Row](3).map(Annotation(_))
+ val tokens = row.getSeq[Row](3).map(Annotation(_)).filterNot(a => a.result == "." || a.result == ",")
val normalizedAnnotations = row.getSeq[Row](4).map(Annotation(_))
normalizedAnnotations.foreach {
- case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
- assert(stem.result.nonEmpty, "Annotation result exists")
+ case nToken: Annotation if nToken.annotatorType == AnnotatorType.TOKEN =>
+ assert(nToken.result.nonEmpty, "Annotation result exists")
case _ =>
}
-
normalizedAnnotations.zip(tokens).foreach {
- case (stem: Annotation, token: Annotation) =>
- assert(stem.result == token.result.replaceAll("[^a-zA-Z]", ""))
+ case (nToken: Annotation, token: Annotation) =>
+ assert(nToken.result == token.result.replaceAll("[^a-zA-Z]", ""))
}
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
index bdb47d64f953a3..415bf09e92e371 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
@@ -51,8 +51,8 @@ trait TokenizerBehaviors { this: FlatSpec =>
val f = fixture(dataset)
f.tokensAnnotations.foreach { a =>
val token = a.result
- val sentenceToken = f.corpus.slice(a.begin, a.end + 1)
- assert(sentenceToken == token, s"Word ($sentenceToken) from sentence at (${a.begin},${a.end}) should be equal to token ($token) inside the corpus ${f.corpus}")
+ val sentenceToken = f.corpus.slice(a.start, a.end + 1)
+ assert(sentenceToken == token, s"Word ($sentenceToken) from sentence at (${a.start},${a.end}) should be equal to token ($token) inside the corpus ${f.corpus}")
}
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
index 3f692ee79517d8..688c420a1f7ad5 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -13,44 +13,52 @@ import org.apache.spark.ml.Pipeline
*/
class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
+ import SparkAccessor.spark.implicits._
+
val regexTokenizer = new Tokenizer
"a Tokenizer" should s"be of type ${AnnotatorType.TOKEN}" in {
assert(regexTokenizer.annotatorType == AnnotatorType.TOKEN)
}
- "a Tokenizer" should "correctly tokenize target text on its defaults parameters" in {
- val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 'He said', I'll defeat markus-crassus.")
- import data.sparkSession.implicits._
+
+ val targetText = "Hello, I won't be from New York in the U.S.A. (and you know it). Give me my horse! or $100 bucks 'He said', I'll defeat markus-crassus."
+ val expected = Array(
+ "Hello", ",", "I", "wo", "n't", "be", "from", "New York", "in", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
+ "Give", "me", "my", "horse", "!", "or", "$100", "bucks", "'", "He", "said", "'", ",", "I", "'ll", "defeat", "markus-crassus", "."
+ )
+
+ "a Tokenizer" should "correctly tokenize target text on its defaults parameters with exceptions" in {
+ val data = DataBuilder.basicDataBuild(targetText)
val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
- val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token")
- val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
+ val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token").setCompositeTokens(Array("New York", "won't"))
+ val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setCleanAnnotations(false).setOutputCols("output")
val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))
- val expected = Array(
- "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
- "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", "."
- )
- val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
+ val pip = pipeline.fit(data).transform(data)
+ val result = pip
+ .select("output").as[Array[String]]
.collect.flatten
assert(
result.sameElements(expected),
s"because result tokens differ: " +
s"\nresult was \n${result.mkString("|")} \nexpected is: \n${expected.mkString("|")}"
)
+ pip
+ .select("token").as[Array[Annotation]]
+ .collect.foreach(annotations => {
+ annotations.foreach(annotation => {
+ assert(targetText.slice(annotation.start, annotation.end + 1) == annotation.result)
+ })
+ })
}
- "a Tokenizer" should "correctly tokenize target sentences on its defaults parameters" in {
- val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 'He said', I'll defeat markus-crassus.")
- import data.sparkSession.implicits._
+ "a Tokenizer" should "correctly tokenize target sentences on its defaults parameters with exceptions" in {
+ val data = DataBuilder.basicDataBuild(targetText)
val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
val sentence = new SentenceDetector().setInputCols("document").setOutputCol("sentence")
- val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token")
+ val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token").setCompositeTokens(Array("New York"))
val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
val pipeline = new Pipeline().setStages(Array(document, sentence, tokenizer, finisher))
- val expected = Array(
- "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
- "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", "."
- )
val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
.collect.flatten
assert(
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
index 327f84e7d80738..5ffcd439c6bb51 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
@@ -35,7 +35,7 @@ class NerCrfApproachSpec extends FlatSpec {
assert(annotations.length == labels.length)
for ((annotation, label) <- annotations.zip(labels)) {
- assert(annotation.begin == label.begin)
+ assert(annotation.start == label.start)
assert(annotation.end == label.end)
assert(annotation.annotatorType == AnnotatorType.NAMED_ENTITY)
assert(annotation.result == label.result)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
index c76e272bf94e5e..e09a7dddcb2bea 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
@@ -52,6 +52,6 @@ class DependencyParserApproachTest extends FlatSpec {
val f = fixture
f.depAnnotations
.zip(f.tokenAnnotations)
- .foreach { case (dep, token) => assert(dep.begin == token.begin && dep.end == token.end, s"Token and word should have equal indixes") }
+ .foreach { case (dep, token) => assert(dep.start == token.start && dep.end == token.end, s"Token and word should have equal indices") }
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
index 7a92c52cca9ede..306c1de43d0743 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
@@ -14,15 +14,15 @@ class GreedyTransitionApproachTest extends FlatSpec {
val tokenAnnotations = Annotation.collect(df, "token")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
val posTagAnnotations = Annotation.collect(df, "pos")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
val sentenceAnnotation = Annotation.collect(df, "sentence")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
index 51ea37f3235051..80beba5221be73 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
@@ -30,7 +30,6 @@ class PerceptronApproachTestSpec extends FlatSpec with PerceptronApproachBehavio
length += text.length + 1
sentence
}
-
new Tokenizer().tag(sentences).toArray
}
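The test changes above exercise the new composite-token behaviour: "New York" is kept as a single token, and each annotation's (start, end) offsets are checked against a slice of the original text. A minimal standalone sketch of the same pipeline follows; it assumes an existing SparkSession named `spark`, and the import paths for DocumentAssembler and Finisher are inferred from the source tree layout.

    // Minimal sketch; assumes `spark: SparkSession` is already in scope.
    import org.apache.spark.ml.Pipeline
    import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher}
    import com.johnsnowlabs.nlp.annotators.Tokenizer

    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")
    val tokenizer = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")
      .setCompositeTokens(Array("New York"))   // kept as a single token
    val finisher = new Finisher()
      .setInputCols("token")
      .setOutputAsArray(true)
      .setOutputCols("output")
    val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))

    import spark.implicits._
    val data = Seq("I won't be from New York.").toDF("text")   // illustrative input
    pipeline.fit(data).transform(data).select("output").show(false)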
From b302ad518980884c24b66146be8f119329fdb1c5 Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 03:13:07 -0300
Subject: [PATCH 5/9] - Fixed Annotation field name
---
src/main/scala/com/johnsnowlabs/nlp/Annotation.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
index f5b7a470762e8b..cf5d925b5081da 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
@@ -39,7 +39,7 @@ object Annotation {
/** This is spark type of an annotation representing its metadata shape */
val dataType = new StructType(Array(
StructField("annotatorType", StringType, nullable = true),
- StructField("begin", IntegerType, nullable = false),
+ StructField("start", IntegerType, nullable = false),
StructField("end", IntegerType, nullable = false),
StructField("result", StringType, nullable = true),
StructField("metadata", MapType(StringType, StringType), nullable = true)
From a68f5a68d1ab011e90901dc7c1f2239a1ff2be52 Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 15:51:19 -0300
Subject: [PATCH 6/9] - Features now truly lazy - Removed beforeAnnotate, back
to lazy
---
.../com/johnsnowlabs/nlp/AnnotatorModel.scala | 4 --
.../com/johnsnowlabs/nlp/HasFeatures.scala | 6 +--
.../nlp/ParamsAndFeaturesWritable.scala | 2 +-
.../nlp/annotators/Tokenizer.scala | 14 +------
.../nlp/serialization/Feature.scala | 38 ++++++++++++++++---
.../nlp/annotators/TokenizerTestSpec.scala | 2 +-
.../ner/crf/NerCrfApproachSpec.scala | 8 ++--
7 files changed, 43 insertions(+), 31 deletions(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
index 4fca2ce01d9a52..d3ed36a2daff82 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
@@ -53,9 +53,6 @@ abstract class AnnotatorModel[M <: Model[M]]
StructType(outputFields)
}
- /** override this function if you need to reset or clear annotate variables just once before annotating */
- def beforeAnnotate(): Unit = {}
-
/**
* Given requirements are met, this applies ML transformation within a Pipeline or stand-alone
* Output annotation will be generated as a new column, previous annotations are still available separately
@@ -68,7 +65,6 @@ abstract class AnnotatorModel[M <: Model[M]]
s"${requiredAnnotatorTypes.mkString(", ")}")
val metadataBuilder: MetadataBuilder = new MetadataBuilder()
metadataBuilder.putString("annotatorType", annotatorType)
- beforeAnnotate()
dataset.withColumn(
getOutputCol,
dfAnnotate(
diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala b/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
index d713366ae16eac..ab6c6035046a51 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
@@ -26,10 +26,10 @@ trait HasFeatures {
protected def get[T](feature: StructFeature[T]): Option[T] = feature.get
- protected def $$[T](feature: ArrayFeature[T]): Array[T] = feature.getValue
+ protected def $$[T](feature: ArrayFeature[T]): Array[T] = feature.getOrDefault
- protected def $$[K, V](feature: MapFeature[K, V]): Map[K, V] = feature.getValue
+ protected def $$[K, V](feature: MapFeature[K, V]): Map[K, V] = feature.getOrDefault
- protected def $$[T](feature: StructFeature[T]): T = feature.getValue
+ protected def $$[T](feature: StructFeature[T]): T = feature.getOrDefault
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala b/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
index aac623b487c02d..9e71fc2884e9dd 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
@@ -11,7 +11,7 @@ class FeaturesWriter[T](annotatorWithFeatures: HasFeatures, baseWriter: MLWriter
baseWriter.save(path)
for (feature <- annotatorWithFeatures.features) {
- feature.serializeInfer(sparkSession, path, feature.name, feature.getValue)
+ feature.serializeInfer(sparkSession, path, feature.name, feature.getOrDefault)
}
onWritten(path, sparkSession)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index 350a877ad0759a..efcad6d59373a0 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -54,14 +54,10 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
setDefault(inputCols, Array(DOCUMENT))
- lazy private val ruleFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
-
/** Clears out rules and constructs a new rule for every combination of rules provided */
/** The strategy is to catch one token per regex group */
/** User may add its own groups if needs targets to be tokenized separately from the rest */
- protected def setFactoryRules(): Unit = {
- ruleFactory
- .clearRules()
+ lazy private val ruleFactory = {
val rules = ArrayBuffer.empty[String]
require($(infixPatterns).nonEmpty)
require($(infixPatterns).forall(ip => ip.contains("(") && ip.contains(")")),
@@ -81,7 +77,7 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
})
rules.append(rule.toString)
})
- rules.foreach(rule => ruleFactory.addRule(rule.r, rule))
+ rules.foldLeft(new RuleFactory(MatchStrategy.MATCH_FIRST))((factory, rule) => factory.addRule(rule.r, rule))
}
/** Check here for explanation on this default pattern */
@@ -97,12 +93,6 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
setDefault(suffixPattern, "([^\\s\\w]?)([^\\s\\w]*)\\z")
setDefault(targetPattern, "\\S+")
- setFactoryRules()
-
- override def beforeAnnotate(): Unit = {
- setFactoryRules()
- }
-
private val PROTECT_STR = "ↈ"
def tag(sentences: Seq[Sentence]): Seq[TokenizedSentence] = {
diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
index f3a251f533423b..3839970bda238d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
@@ -2,23 +2,29 @@ package com.johnsnowlabs.nlp.serialization
import com.johnsnowlabs.nlp.HasFeatures
import com.johnsnowlabs.nlp.util.ConfigHelper
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import scala.reflect.ClassTag
-abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model: HasFeatures, val name: String)(implicit val sparkSession: SparkSession = SparkSession.builder().getOrCreate()) extends Serializable {
+abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model: HasFeatures, val name: String) extends Serializable {
model.features.append(this)
private val config = ConfigHelper.retrieve
+ private val spark = ResourceHelper.spark
val serializationMode: String = config.getString("performance.serialization")
val useBroadcast: Boolean = config.getBoolean("performance.useBroadcast")
final protected var broadcastValue: Option[Broadcast[TComplete]] = None
+ final protected var fallbackBroadcastValue: Option[Broadcast[TComplete]] = None
+
final protected var rawValue: Option[TComplete] = None
- final protected var fallback: Option[() => TComplete] = None
+ final protected var fallbackRawValue: Option[TComplete] = None
+
+ final protected var fallbackLazyValue: Option[() => TComplete] = None
final def serialize(spark: SparkSession, path: String, field: String, value: TComplete): Unit = {
serializationMode match {
@@ -52,18 +58,38 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
final protected def getFieldPath(path: String, field: String): Path =
Path.mergePaths(new Path(path), new Path("/fields/" + field))
+ private def callAndSetFallback: Option[TComplete] = {
+ if (useBroadcast) {
+ fallbackBroadcastValue = Some(spark.sparkContext.broadcast[TComplete](fallbackLazyValue.get.asInstanceOf[TComplete]))
+ fallbackBroadcastValue.map(_.value)
+ } else {
+ fallbackRawValue = fallbackLazyValue.map(_())
+ fallbackRawValue
+ }
+ }
+
final def get: Option[TComplete] = {
broadcastValue.map(_.value).orElse(rawValue)
}
- final def getValue: TComplete = {
- broadcastValue.map(_.value).orElse(rawValue).orElse(fallback.map(_())).getOrElse(throw new Exception(s"feature $name is not set"))
+ final def getOrDefault: TComplete = {
+ if (useBroadcast) {
+ broadcastValue.map(_.value)
+ .orElse(fallbackBroadcastValue.map(_.value))
+ .orElse(callAndSetFallback)
+ .getOrElse(throw new Exception(s"feature $name is not set"))
+ } else {
+ rawValue
+ .orElse(fallbackRawValue)
+ .orElse(callAndSetFallback)
+ .getOrElse(throw new Exception(s"feature $name is not set"))
+ }
}
final def setValue(v: Option[Any]): HasFeatures = {
if (useBroadcast) {
if (isSet) broadcastValue.get.destroy()
- broadcastValue = Some(sparkSession.sparkContext.broadcast[TComplete](v.get.asInstanceOf[TComplete]))
+ broadcastValue = Some(spark.sparkContext.broadcast[TComplete](v.get.asInstanceOf[TComplete]))
} else {
rawValue = Some(v.get.asInstanceOf[TComplete])
}
@@ -71,7 +97,7 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
}
def setFallback(v: Option[() => TComplete]): HasFeatures = {
- fallback = v
+ fallbackLazyValue = v
model
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
index 688c420a1f7ad5..001dca7d8eaf99 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -31,7 +31,7 @@ class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
"a Tokenizer" should "correctly tokenize target text on its defaults parameters with exceptions" in {
val data = DataBuilder.basicDataBuild(targetText)
val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
- val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token").setCompositeTokens(Array("New York", "won't"))
+ val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token").setCompositeTokens(Array("New York"))
val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setCleanAnnotations(false).setOutputCols("output")
val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))
val pip = pipeline.fit(data).transform(data)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
index 5ffcd439c6bb51..b72dacc0045642 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
@@ -16,14 +16,14 @@ class NerCrfApproachSpec extends FlatSpec {
nerModel.write.overwrite.save("./test_crf_pipeline")
val loadedNer = NerCrfModel.read.load("./test_crf_pipeline")
- assert(nerModel.model.getValue.serialize == loadedNer.model.getValue.serialize)
- assert(nerModel.dictionaryFeatures.getValue == loadedNer.dictionaryFeatures.getValue)
+ assert(nerModel.model.getOrDefault.serialize == loadedNer.model.getOrDefault.serialize)
+ assert(nerModel.dictionaryFeatures.getOrDefault == loadedNer.dictionaryFeatures.getOrDefault)
}
"NerCrfApproach" should "have correct set of labels" in {
assert(nerModel.model.isSet)
- val metadata = nerModel.model.getValue.metadata
+ val metadata = nerModel.model.getOrDefault.metadata
assert(metadata.labels.toSeq == Seq("@#Start", "PER", "O", "ORG", "LOC"))
}
@@ -65,7 +65,7 @@ class NerCrfApproachSpec extends FlatSpec {
"NerCrfModel" should "correctly handle entities param" in {
val restrictedModel = new NerCrfModel()
.setEntities(Array("PER", "LOC"))
- .setModel(nerModel.model.getValue)
+ .setModel(nerModel.model.getOrDefault)
.setOutputCol(nerModel.getOutputCol)
.setInputCols(nerModel.getInputCols)
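The intent of this patch is that a Feature's default is no longer recomputed through beforeAnnotate on every transform; an explicitly set value wins, and otherwise the registered fallback is evaluated once and cached. The following is a distilled, standalone illustration of that pattern (not the library's Feature class, which also handles broadcasting and serialization):

    // Illustration only: a set value wins; otherwise the fallback is computed once and cached.
    class LazyFeature[T](name: String) {
      private var rawValue: Option[T] = None
      private var fallbackRawValue: Option[T] = None
      private var fallbackLazyValue: Option[() => T] = None

      def setValue(v: T): this.type = { rawValue = Some(v); this }
      def setFallback(f: () => T): this.type = { fallbackLazyValue = Some(f); this }

      def getOrDefault: T =
        rawValue
          .orElse(fallbackRawValue)
          .orElse { fallbackRawValue = fallbackLazyValue.map(_()); fallbackRawValue }
          .getOrElse(throw new Exception(s"feature $name is not set"))
    }

    // e.g. new LazyFeature[Array[String]]("rules").setFallback(() => Array("\\p{L}+")).getOrDefault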
From a234dd6030480f28fc5717811386afb67fbb319d Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 16:30:30 -0300
Subject: [PATCH 7/9] - Fixed bad lazy call
---
src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
index 3839970bda238d..989318c56bf23c 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
@@ -60,7 +60,7 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
private def callAndSetFallback: Option[TComplete] = {
if (useBroadcast) {
- fallbackBroadcastValue = Some(spark.sparkContext.broadcast[TComplete](fallbackLazyValue.get.asInstanceOf[TComplete]))
+ fallbackBroadcastValue = fallbackLazyValue.map(v => spark.sparkContext.broadcast[TComplete](v()))
fallbackBroadcastValue.map(_.value)
} else {
fallbackRawValue = fallbackLazyValue.map(_())
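The one-line fix replaces an unconditional `.get` on the fallback Option with `.map`, so a Feature with no fallback registered no longer throws NoSuchElementException while building the broadcast. In plain Scala terms:

    val noFallback: Option[() => Int] = None
    // noFallback.get                       // NoSuchElementException at runtime
    val safe = noFallback.map(f => f())     // None, no exception; the lookup simply falls through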
From 562e254ceae92c34173f08cdc788976c31f5d7fc Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 17:16:05 -0300
Subject: [PATCH 8/9] - Fixed NullPointerException. Fallback Default Feature
value may not be broadcast
---
.../nlp/serialization/Feature.scala | 32 ++++++-------------
1 file changed, 10 insertions(+), 22 deletions(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
index 989318c56bf23c..404e3a948bb86d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
@@ -19,7 +19,6 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
val useBroadcast: Boolean = config.getBoolean("performance.useBroadcast")
final protected var broadcastValue: Option[Broadcast[TComplete]] = None
- final protected var fallbackBroadcastValue: Option[Broadcast[TComplete]] = None
final protected var rawValue: Option[TComplete] = None
final protected var fallbackRawValue: Option[TComplete] = None
@@ -59,13 +58,8 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
Path.mergePaths(new Path(path), new Path("/fields/" + field))
private def callAndSetFallback: Option[TComplete] = {
- if (useBroadcast) {
- fallbackBroadcastValue = fallbackLazyValue.map(v => spark.sparkContext.broadcast[TComplete](v()))
- fallbackBroadcastValue.map(_.value)
- } else {
- fallbackRawValue = fallbackLazyValue.map(_())
- fallbackRawValue
- }
+ fallbackRawValue = fallbackLazyValue.map(_())
+ fallbackRawValue
}
final def get: Option[TComplete] = {
@@ -73,25 +67,19 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
}
final def getOrDefault: TComplete = {
- if (useBroadcast) {
- broadcastValue.map(_.value)
- .orElse(fallbackBroadcastValue.map(_.value))
- .orElse(callAndSetFallback)
- .getOrElse(throw new Exception(s"feature $name is not set"))
- } else {
- rawValue
- .orElse(fallbackRawValue)
- .orElse(callAndSetFallback)
- .getOrElse(throw new Exception(s"feature $name is not set"))
- }
+ broadcastValue.map(_.value)
+ .orElse(rawValue)
+ .orElse(fallbackRawValue)
+ .orElse(callAndSetFallback)
+ .getOrElse(throw new Exception(s"feature $name is not set"))
}
- final def setValue(v: Option[Any]): HasFeatures = {
+ final def setValue(value: Option[Any]): HasFeatures = {
if (useBroadcast) {
if (isSet) broadcastValue.get.destroy()
- broadcastValue = Some(spark.sparkContext.broadcast[TComplete](v.get.asInstanceOf[TComplete]))
+ broadcastValue = value.map(v => spark.sparkContext.broadcast[TComplete](v.asInstanceOf[TComplete]))
} else {
- rawValue = Some(v.get.asInstanceOf[TComplete])
+ rawValue = Some(value.get.asInstanceOf[TComplete])
}
model
}
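After this patch the lookup order in getOrDefault is fixed: the explicitly set (and possibly broadcast) value, then the raw value, then the cached fallback, then the lazily computed fallback, and fallback defaults are never broadcast. A plain-Option sketch of that precedence, with illustrative names only:

    // Illustrative sketch of the lookup order, not the library code.
    def resolve[T](broadcastValue: Option[T],
                   rawValue: Option[T],
                   cachedFallback: Option[T],
                   lazyFallback: Option[() => T]): T =
      broadcastValue
        .orElse(rawValue)
        .orElse(cachedFallback)
        .orElse(lazyFallback.map(_()))
        .getOrElse(throw new Exception("feature is not set"))

    // resolve(None, None, None, Some(() => 42)) == 42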
From 892f782a37feef7e0b84ff60650252a98ce7b250 Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 17:50:02 -0300
Subject: [PATCH 9/9] - language agnostic tokenizer defaults
---
.../nlp/annotators/Tokenizer.scala | 18 +++++++++---------
.../nlp/annotators/TokenizerTestSpec.scala | 4 ++--
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index efcad6d59373a0..4f4e63a65302b0 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -20,8 +20,8 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
val compositeTokens: StringArrayParam = new StringArrayParam(this, "compositeTokens", "Words that won't be split in two")
val targetPattern: Param[String] = new Param(this, "targetPattern", "pattern to grab from text as token candidates. Defaults \\S+")
val infixPatterns: StringArrayParam = new StringArrayParam(this, "infixPattern", "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults")
- val prefixPattern: Param[String] = new Param[String](this, "prefixPattern", "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\w\\$\\.]*)")
- val suffixPattern: Param[String] = new Param[String](this, "suffixPattern", "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\w]?)([^\\s\\w]*)\\z")
+ val prefixPattern: Param[String] = new Param[String](this, "prefixPattern", "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\p{L}$\\.]*)")
+ val suffixPattern: Param[String] = new Param[String](this, "suffixPattern", "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\p{L}]?)([^\\s\\p{L}]*)\\z")
override val annotatorType: AnnotatorType = TOKEN
@@ -82,15 +82,15 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
/** Check here for explanation on this default pattern */
setDefault(infixPatterns, Array(
- "((?:\\w+\\.)+)", // http://rubular.com/r/cRBtGuLlF6
- "(\\w+)(n't\\b)", // http://rubular.com/r/coeYJFt8eM
- "(\\w+)('{1}\\w+)", // http://rubular.com/r/N84PYwYjQp
- "((?:\\w+[^\\s\\w]{1})+\\w+)", // http://rubular.com/r/wOvQcey9e3
- "(\\w+)" // basic word token
+ "((?:\\p{L}+\\.)+)", // http://rubular.com/r/cRBtGuLlF6
+ "(\\p{L}+)(n't\\b)", // http://rubular.com/r/coeYJFt8eM
+ "(\\p{L}+)('{1}\\p{L}+)", // http://rubular.com/r/N84PYwYjQp
+ "((?:\\p{L}+[^\\s\\p{L}]{1})+\\p{L}+)", // http://rubular.com/r/wOvQcey9e3
+ "(\\p{L}+)" // basic word token
))
/** These catch everything before and after a word, as a separate token*/
- setDefault(prefixPattern, "\\A([^\\s\\w\\$\\.]*)")
- setDefault(suffixPattern, "([^\\s\\w]?)([^\\s\\w]*)\\z")
+ setDefault(prefixPattern, "\\A([^\\s\\p{L}$\\.]*)")
+ setDefault(suffixPattern, "([^\\s\\p{L}]?)([^\\s\\p{L}]*)\\z")
setDefault(targetPattern, "\\S+")
private val PROTECT_STR = "ↈ"
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
index 001dca7d8eaf99..481abd7c9baa8a 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -22,9 +22,9 @@ class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
}
- val targetText = "Hello, I won't be from New York in the U.S.A. (and you know it). Give me my horse! or $100 bucks 'He said', I'll defeat markus-crassus."
+ val targetText = "Hello, I won't be from New York in the U.S.A. (and you know it héroe). Give me my horse! or $100 bucks 'He said', I'll defeat markus-crassus."
val expected = Array(
- "Hello", ",", "I", "wo", "n't", "be", "from", "New York", "in", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
+ "Hello", ",", "I", "wo", "n't", "be", "from", "New York", "in", "the", "U.S.A.", "(", "and", "you", "know", "it", "héroe", ")", ".",
"Give", "me", "my", "horse", "!", "or", "$100", "bucks", "'", "He", "said", "'", ",", "I", "'ll", "defeat", "markus-crassus", "."
)
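The switch from \w to \p{L} is what makes the defaults language agnostic: in Java/Scala regexes \w matches ASCII word characters only, so an accented token such as "héroe" gets split, while \p{L} matches any Unicode letter. A quick check:

    val asciiWord   = "\\w+".r
    val unicodeWord = "\\p{L}+".r

    println(asciiWord.findAllIn("héroe").toList)    // List(h, roe) -- the accent breaks \w
    println(unicodeWord.findAllIn("héroe").toList)  // List(héroe)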