diff --git a/docs/components.html b/docs/components.html
index d191942d012412..57314aba919844 100644
--- a/docs/components.html
+++ b/docs/components.html
@@ -172,7 +172,7 @@
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence") \
.setUseAbbreviations(True)
@@ -673,7 +673,7 @@ 9. SentenceDetector: Sentence B
Example:
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setInputCols("document")
.setOutputCol("sentence")
@@ -790,7 +790,7 @@ 11. SentimentDetector: Sentime
Example:
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment")
@@ -825,7 +825,7 @@ 11. SentimentDetector: Sentime
Example:
- val sentimentDetector = new SentimentDetectorModel
+ val sentimentDetector = new SentimentDetector
.setInputCols(Array("token", "sentence"))
.setOutputCol("sentiment")
@@ -902,7 +902,7 @@ 13. SpellChecker: Token spell
Inputs: Any text for corpus. A list of words for dictionary. A
comma
separated custom dictionary.
- Requires: RegexTokenizer
+ Requires: Tokenizer
Functions:
-
@@ -947,7 +947,7 @@
13. SpellChecker: Token spell
Inputs: Any text for corpus. A list of words for dictionary. A
comma
separated custom dictionary.
- Requires: RegexTokenizer
+ Requires: Tokenizer
Functions:
-
@@ -1017,7 +1017,7 @@
14. ViveknSentimentDetec
Input: File or folder of text files of positive and negative data
Example:
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment")
@@ -1225,7 +1225,7 @@ 16. TokenAssembler: Getting data
Annotators
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
-val sentenceDetector = new SentenceDetectorModel()
+ import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
+val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")
-val regexTokenizer = new RegexTokenizer()
+val regexTokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
diff --git a/python/example/crf-ner/ner.ipynb b/python/example/crf-ner/ner.ipynb
index a29bbbbb5b301c..7bf620236fa9ff 100644
--- a/python/example/crf-ner/ner.ipynb
+++ b/python/example/crf-ner/ner.ipynb
@@ -101,11 +101,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- "sentenceDetector = SentenceDetectorModel()\\\n",
+ "sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer()\\\n",
+ "tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/crf-ner/ner_benchmark.ipynb b/python/example/crf-ner/ner_benchmark.ipynb
index b63c636ebe4833..4877ef0db82e5f 100644
--- a/python/example/crf-ner/ner_benchmark.ipynb
+++ b/python/example/crf-ner/ner_benchmark.ipynb
@@ -182,11 +182,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- " sentenceDetector = SentenceDetectorModel()\\\n",
+ " sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- " tokenizer = RegexTokenizer()\\\n",
+ " tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/dictionary-sentiment/sentiment.ipynb b/python/example/dictionary-sentiment/sentiment.ipynb
index 81264b96d533b6..0726efe0ae50f3 100644
--- a/python/example/dictionary-sentiment/sentiment.ipynb
+++ b/python/example/dictionary-sentiment/sentiment.ipynb
@@ -45,11 +45,11 @@
"document_assembler = DocumentAssembler() \\\n",
" .setInputCol(\"text\")\n",
"\n",
- "sentence_detector = SentenceDetectorModel() \\\n",
+ "sentence_detector = SentenceDetector() \\\n",
" .setInputCols([\"document\"]) \\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer() \\\n",
+ "tokenizer = Tokenizer() \\\n",
" .setInputCols([\"sentence\"]) \\\n",
" .setOutputCol(\"token\")\n",
"\n",
@@ -58,7 +58,7 @@
" .setOutputCol(\"lemma\") \\\n",
" .setDictionary(\"../../../src/test/resources/lemma-corpus/AntBNC_lemmas_ver_001.txt\")\n",
" \n",
- "sentiment_detector = SentimentDetectorModel() \\\n",
+ "sentiment_detector = SentimentDetector() \\\n",
" .setInputCols([\"lemma\", \"sentence\"]) \\\n",
" .setOutputCol(\"sentiment_score\") \\\n",
" .setDictPath(\"../../../src/test/resources/sentiment-corpus/default-sentiment-dict.txt\")\n",
diff --git a/python/example/entities-extractor/extractor.ipynb b/python/example/entities-extractor/extractor.ipynb
index 625d4523df7f41..a0d692f3bd2c1f 100644
--- a/python/example/entities-extractor/extractor.ipynb
+++ b/python/example/entities-extractor/extractor.ipynb
@@ -51,11 +51,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- "sentenceDetector = SentenceDetectorModel()\\\n",
+ "sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer()\\\n",
+ "tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/vivekn-sentiment/sentiment.ipynb b/python/example/vivekn-sentiment/sentiment.ipynb
index f453217ad1e14c..bf11f116846402 100644
--- a/python/example/vivekn-sentiment/sentiment.ipynb
+++ b/python/example/vivekn-sentiment/sentiment.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"collapsed": true
},
@@ -20,9 +20,42 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------+---------+--------------------+\n",
+ "|itemid|sentiment| text|\n",
+ "+------+---------+--------------------+\n",
+ "| 1| 0| ...|\n",
+ "| 2| 0| ...|\n",
+ "| 3| 1| omg...|\n",
+ "| 4| 0| .. Omga...|\n",
+ "| 5| 0| i think ...|\n",
+ "| 6| 0| or i jus...|\n",
+ "| 7| 1| Juuuuuuuuu...|\n",
+ "| 8| 0| Sunny Agai...|\n",
+ "| 9| 1| handed in m...|\n",
+ "| 10| 1| hmmmm.... i...|\n",
+ "| 11| 0| I must thin...|\n",
+ "| 12| 1| thanks to a...|\n",
+ "| 13| 0| this weeken...|\n",
+ "| 14| 0| jb isnt show...|\n",
+ "| 15| 0| ok thats it ...|\n",
+ "| 16| 0| <-------- ...|\n",
+ "| 17| 0| awhhe man.......|\n",
+ "| 18| 1| Feeling stran...|\n",
+ "| 19| 0| HUGE roll of ...|\n",
+ "| 20| 0| I just cut my...|\n",
+ "+------+---------+--------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"#Load the input data to be annotated\n",
"data = spark. \\\n",
@@ -36,7 +69,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"collapsed": true
},
@@ -59,7 +92,7 @@
"outputs": [],
"source": [
"### Sentence detector\n",
- "sentence_detector = SentenceDetectorModel() \\\n",
+ "sentence_detector = SentenceDetector() \\\n",
" .setInputCols([\"document\"]) \\\n",
" .setOutputCol(\"sentence\")\n",
"#sentence_data = sentence_detector.transform(checked)"
@@ -74,7 +107,7 @@
"outputs": [],
"source": [
"### Tokenizer\n",
- "tokenizer = RegexTokenizer() \\\n",
+ "tokenizer = Tokenizer() \\\n",
" .setInputCols([\"sentence\"]) \\\n",
" .setOutputCol(\"token\")\n",
"#tokenized = tokenizer.transform(assembled)"
@@ -154,7 +187,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"pipeline = Pipeline(stages=[\n",
@@ -178,7 +213,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"for r in sentiment_data.take(5):\n",
@@ -188,7 +225,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
@@ -201,7 +240,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
@@ -214,7 +255,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
index dfecd5b8c6ca58..8c1050f10f6989 100755
--- a/python/sparknlp/annotator.py
+++ b/python/sparknlp/annotator.py
@@ -83,20 +83,52 @@ def __init__(self):
super(JavaTransformer, self).__init__()
-class RegexTokenizer(AnnotatorTransformer):
+class Tokenizer(AnnotatorTransformer):
- pattern = Param(Params._dummy(),
- "pattern",
- "regular expression pattern for tokenization",
+ targetPattern = Param(Params._dummy(),
+ "targetPattern",
+ "pattern to grab from text as token candidates. Defaults \S+",
typeConverter=TypeConverters.toString)
+ prefixPattern = Param(Params._dummy(),
+ "prefixPattern",
+ "regex with groups and begins with \A to match target prefix. Defaults to \A([^\s\w\$\.]*)",
+ typeConverter=TypeConverters.toString)
+
+ suffixPattern = Param(Params._dummy(),
+ "suffixPattern",
+ "regex with groups and ends with \z to match target suffix. Defaults to ([^\s\w]?)([^\s\w]*)\z",
+ typeConverter=TypeConverters.toString)
+
+ compositeTokens = Param(Params._dummy(),
+ "compositeTokens",
+ "Words that won't be split in two",
+ typeConverter=TypeConverters.toListString)
+
+ infixPatterns = Param(Params._dummy(),
+ "infixPatterns",
+ "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults",
+ typeConverter=TypeConverters.toListString)
+
@keyword_only
def __init__(self):
- super(RegexTokenizer, self).__init__()
- self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.RegexTokenizer", self.uid)
+ super(Tokenizer, self).__init__()
+ self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.Tokenizer", self.uid)
- def setPattern(self, value):
- return self._set(pattern=value)
+ def setTargetPattern(self, value):
+ return self._set(targetPattern=value)
+
+ def setPrefixPattern(self, value):
+ return self._set(prefixPattern=value)
+
+ def setSuffixPattern(self, value):
+ return self._set(suffixPattern=value)
+
+ def setCompositeTokens(self, value):
+ return self._set(compositeTokens=value)
+
+ def setInfixPatterns(self, value):
+ return self._set(infixPatterns=value)
class Stemmer(AnnotatorTransformer):
@@ -307,7 +339,7 @@ class PerceptronModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorProper
name = "PerceptronModel"
-class SentenceDetectorModel(AnnotatorTransformer):
+class SentenceDetector(AnnotatorTransformer):
useAbbreviations = Param(Params._dummy(),
"useAbbreviations",
@@ -329,11 +361,11 @@ def setUseAbbreviations(self, value):
@keyword_only
def __init__(self):
- super(SentenceDetectorModel, self).__init__()
- self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel", self.uid)
+ super(SentenceDetector, self).__init__()
+ self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector", self.uid)
-class SentimentDetectorModel(AnnotatorTransformer):
+class SentimentDetector(AnnotatorTransformer):
dictPath = Param(Params._dummy(),
"dictPath",
"path for dictionary to sentiment analysis")
@@ -348,8 +380,8 @@ class SentimentDetectorModel(AnnotatorTransformer):
@keyword_only
def __init__(self):
- super(SentimentDetectorModel, self).__init__()
- self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel", self.uid)
+ super(SentimentDetector, self).__init__()
+ self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector", self.uid)
def setDictPath(self, value):
return self._set(dictPath=value)
diff --git a/python/test/annotators.py b/python/test/annotators.py
index e42745ffb7438b..21051f72d5dd25 100644
--- a/python/test/annotators.py
+++ b/python/test/annotators.py
@@ -16,8 +16,9 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer()\
- .setOutputCol("token")
+ tokenizer = Tokenizer()\
+ .setOutputCol("token") \
+ .setCompositeTokens(["New York"])
stemmer = Stemmer() \
.setInputCols(["token"]) \
.setOutputCol("stem")
@@ -29,7 +30,8 @@ def runTest(self):
.setOutputCol("assembled")
finisher = Finisher() \
.setInputCols(["assembled"]) \
- .setOutputCols(["reassembled_view"])
+ .setOutputCols(["reassembled_view"]) \
+ .setCleanAnnotations(True)
assembled = document_assembler.transform(self.data)
tokenized = tokenizer.transform(assembled)
stemmed = stemmer.transform(tokenized)
@@ -65,7 +67,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
@@ -85,7 +87,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Normalizer() \
.setInputCols(["token"]) \
@@ -121,7 +123,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
entity_extractor = EntityExtractor() \
.setOutputCol("entity")
@@ -139,10 +141,10 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setInputCols(["sentence"]) \
.setOutputCol("token")
pos_tagger = PerceptronApproach() \
@@ -166,7 +168,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence") \
.setCustomBounds(["%%"])
@@ -183,17 +185,17 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setInputCols(["sentence"]) \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
.setOutputCol("lemma") \
.setDictionary({"missed": "miss"})
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment") \
.setDictPath("../src/test/resources/sentiment-corpus/default-sentiment-dict.txt")
@@ -213,7 +215,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
@@ -248,7 +250,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
spell_checker = NorvigSweetingApproach() \
.setInputCols(["token"]) \
diff --git a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
index d233cd09853292..cf5d925b5081da 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
@@ -10,11 +10,11 @@ import scala.collection.Map
/**
* represents annotator's output parts and their details
* @param annotatorType the type of annotation
- * @param begin the index of the first character under this annotation
+ * @param start the index of the first character under this annotation
* @param end the index after the last character under this annotation
* @param metadata associated metadata for this annotation
*/
-case class Annotation(annotatorType: String, begin: Int, end: Int, result: String, metadata: Map[String, String])
+case class Annotation(annotatorType: String, start: Int, end: Int, result: String, metadata: Map[String, String])
object Annotation {
@@ -39,7 +39,7 @@ object Annotation {
/** This is spark type of an annotation representing its metadata shape */
val dataType = new StructType(Array(
StructField("annotatorType", StringType, nullable = true),
- StructField("begin", IntegerType, nullable = false),
+ StructField("start", IntegerType, nullable = false),
StructField("end", IntegerType, nullable = false),
StructField("result", StringType, nullable = true),
StructField("metadata", MapType(StringType, StringType), nullable = true)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala b/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
index d713366ae16eac..ab6c6035046a51 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
@@ -26,10 +26,10 @@ trait HasFeatures {
protected def get[T](feature: StructFeature[T]): Option[T] = feature.get
- protected def $$[T](feature: ArrayFeature[T]): Array[T] = feature.getValue
+ protected def $$[T](feature: ArrayFeature[T]): Array[T] = feature.getOrDefault
- protected def $$[K, V](feature: MapFeature[K, V]): Map[K, V] = feature.getValue
+ protected def $$[K, V](feature: MapFeature[K, V]): Map[K, V] = feature.getOrDefault
- protected def $$[T](feature: StructFeature[T]): T = feature.getValue
+ protected def $$[T](feature: StructFeature[T]): T = feature.getOrDefault
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala b/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
index aac623b487c02d..9e71fc2884e9dd 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
@@ -11,7 +11,7 @@ class FeaturesWriter[T](annotatorWithFeatures: HasFeatures, baseWriter: MLWriter
baseWriter.save(path)
for (feature <- annotatorWithFeatures.features) {
- feature.serializeInfer(sparkSession, path, feature.name, feature.getValue)
+ feature.serializeInfer(sparkSession, path, feature.name, feature.getOrDefault)
}
onWritten(path, sparkSession)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
index a8f9a4ed949f0a..f6011b31ad477f 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
@@ -21,7 +21,7 @@ class TokenAssembler(override val uid: String) extends AnnotatorModel[TokenAssem
.map{case (_, sentenceAnnotations) =>
Annotation(
DOCUMENT,
- sentenceAnnotations.minBy(_.begin).begin,
+ sentenceAnnotations.minBy(_.start).start,
sentenceAnnotations.maxBy(_.end).end,
sentenceAnnotations.map(_.result).mkString(" "),
Map.empty[String, String]
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala
index fe2ad96a5ff242..0273cf9a6ab24a 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala
@@ -79,7 +79,7 @@ class EntityExtractor(override val uid: String) extends AnnotatorModel[EntityExt
private def loadEntities(): Unit = {
val src = EntityExtractor.retrieveEntityExtractorPhrases($(entitiesPath), $(entitiesFormat))
- val tokenizer = new RegexTokenizer().setPattern("\\w+")
+ val tokenizer = new Tokenizer()
val normalizer = new Normalizer()
val phrases: Array[Array[String]] = src.map {
line =>
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
index 0c1863b0d33fa6..1dd0739bfff007 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
@@ -78,7 +78,7 @@ class Lemmatizer(override val uid: String) extends AnnotatorModel[Lemmatizer] {
val token = tokenAnnotation.result
Annotation(
annotatorType,
- tokenAnnotation.begin,
+ tokenAnnotation.start,
tokenAnnotation.end,
$$(lemmaDict).getOrElse(token, token),
tokenAnnotation.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
index ea9b743406f0f8..ca83ae0f6c3d8c 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
@@ -19,7 +19,7 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
val pattern = new Param[String](this, "pattern", "normalization regex pattern which match will be replaced with a space")
val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")
- setDefault(pattern, "[^a-zA-Z]")
+ setDefault(pattern, "[^\\pL+]")
setDefault(lowercase, true)
def getPattern: String = $(pattern)
@@ -44,7 +44,7 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
.trim
Annotation(
annotatorType,
- token.begin,
+ token.start,
token.end,
nToken,
token.metadata
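
The new default pattern is Unicode-aware: anything that is not a Unicode letter (or a literal '+') is replaced with a space, instead of anything outside a-z/A-Z. A rough stand-alone sketch of the difference (plain Java regex, not the annotator itself):

    "Müller-López!".replaceAll("[^a-zA-Z]", " ").trim  // old default: "M ller L pez"
    "Müller-López!".replaceAll("[^\\pL+]", " ").trim   // new default: "Müller López"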
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizer.scala
deleted file mode 100644
index 8a51b268cdcd12..00000000000000
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizer.scala
+++ /dev/null
@@ -1,55 +0,0 @@
-package com.johnsnowlabs.nlp.annotators
-
-import com.johnsnowlabs.nlp.annotators.common._
-import org.apache.spark.ml.param.Param
-import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType}
-import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
-
-import scala.util.matching.Regex
-
-/**
- * Tokenizes raw text into word pieces, tokens.
- * @param uid required uid for storing annotator to disk
- * @@ pattern: RegexPattern to split phrases into tokens
- */
-class RegexTokenizer(override val uid: String) extends AnnotatorModel[RegexTokenizer] {
-
- import com.johnsnowlabs.nlp.AnnotatorType._
-
- val pattern: Param[String] = new Param(this, "pattern", "this is the token pattern")
-
- lazy val regex: Regex = $(pattern).r
-
- override val annotatorType: AnnotatorType = TOKEN
-
- def this() = this(Identifiable.randomUID("REGEX_TOKENIZER"))
-
- def setPattern(value: String): this.type = set(pattern, value)
-
- def getPattern: String = $(pattern)
-
- setDefault(inputCols, Array(DOCUMENT))
-
- /** A RegexTokenizer could require only for now a SentenceDetectorModel annotator */
- override val requiredAnnotatorTypes: Array[AnnotatorType] = Array[AnnotatorType](DOCUMENT)
-
- setDefault(pattern, "\\S+")
-
- def tag(sentences: Seq[Sentence]): Seq[TokenizedSentence] = {
- sentences.map{text =>
- val tokens = regex.findAllMatchIn(text.content).map { m =>
- IndexedToken(m.matched, text.begin + m.start, text.begin + m.end - 1)
- }.toArray
- TokenizedSentence(tokens)
- }
- }
-
- /** one to many annotation */
- override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
- val sentences = SentenceSplit.unpack(annotations)
- val tokenized = tag(sentences)
- Tokenized.pack(tokenized)
- }
-}
-
-object RegexTokenizer extends DefaultParamsReadable[RegexTokenizer]
\ No newline at end of file
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
index b70e82490e2775..833ca16c5cbb45 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
@@ -37,7 +37,7 @@ class Stemmer(override val uid: String) extends AnnotatorModel[Stemmer] {
val stem = EnglishStemmer.stem(tokenAnnotation.result)
Annotation(
annotatorType,
- tokenAnnotation.begin,
+ tokenAnnotation.start,
tokenAnnotation.end,
stem,
tokenAnnotation.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
new file mode 100644
index 00000000000000..4f4e63a65302b0
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -0,0 +1,154 @@
+package com.johnsnowlabs.nlp.annotators
+
+import com.johnsnowlabs.nlp.annotators.common._
+import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}
+import org.apache.spark.ml.param.{Param, StringArrayParam}
+import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
+import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
+
+import scala.collection.mutable.ArrayBuffer
+
+/**
+ * Tokenizes raw text into word pieces, tokens.
+ * @param uid required uid for storing annotator to disk
+ * @@ targetPattern: regex to grab token candidates; prefix/infix/suffix patterns and compositeTokens control how candidates are split
+ */
+class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
+
+ import com.johnsnowlabs.nlp.AnnotatorType._
+
+ val compositeTokens: StringArrayParam = new StringArrayParam(this, "compositeTokens", "Words that won't be split in two")
+ val targetPattern: Param[String] = new Param(this, "targetPattern", "pattern to grab from text as token candidates. Defaults to \\S+")
+ val infixPatterns: StringArrayParam = new StringArrayParam(this, "infixPatterns", "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults")
+ val prefixPattern: Param[String] = new Param[String](this, "prefixPattern", "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\p{L}$\\.]*)")
+ val suffixPattern: Param[String] = new Param[String](this, "suffixPattern", "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\p{L}]?)([^\\s\\p{L}]*)\\z")
+
+ override val annotatorType: AnnotatorType = TOKEN
+
+ /** For now, a Tokenizer only requires DOCUMENT-type input, e.g. the output of a SentenceDetector */
+ override val requiredAnnotatorTypes: Array[AnnotatorType] = Array[AnnotatorType](DOCUMENT)
+
+ def this() = this(Identifiable.randomUID("REGEX_TOKENIZER"))
+
+ def setTargetPattern(value: String): this.type = set(targetPattern, value)
+
+ def setExtensionPatterns(value: Array[String]): this.type = set(infixPatterns, value)
+
+ def addInfixPattern(value: String): this.type = set(infixPatterns, value +: $(infixPatterns))
+
+ def setPrefixPattern(value: String): this.type = set(prefixPattern, value)
+
+ def setSuffixPattern(value: String): this.type = set(suffixPattern, value)
+
+ def setCompositeTokens(value: Array[String]): this.type = set(compositeTokens, value)
+
+ def getCompositeTokens: Array[String] = $(compositeTokens)
+
+ def getInfixPatterns: Array[String] = $(infixPatterns)
+
+ def getPrefixPattern: String = $(prefixPattern)
+
+ def getSuffixPattern: String = $(suffixPattern)
+
+ def getTargetPattern: String = $(targetPattern)
+
+ setDefault(inputCols, Array(DOCUMENT))
+
+ /** Clears out rules and constructs a new rule for every combination of rules provided */
+ /** The strategy is to catch one token per regex group */
+ /** User may add its own groups if needs targets to be tokenized separately from the rest */
+ lazy private val ruleFactory = {
+ val rules = ArrayBuffer.empty[String]
+ require($(infixPatterns).nonEmpty, "infixPatterns cannot be empty")
+ require($(infixPatterns).forall(ip => ip.contains("(") && ip.contains(")")),
+ "infix patterns must use regex groups. Note that each group will result in a separate token")
+ $(infixPatterns).foreach(ip => {
+ val rule = new StringBuilder
+ get(prefixPattern).orElse(getDefault(prefixPattern)).foreach(pp => {
+ require(pp.startsWith("\\A"), "prefixPattern must begin with \\A to ensure it is the beginning of the string")
+ require(pp.contains("(") && pp.contains(")"), "prefixPattern must contain regex groups. Each group will result in a separate token")
+ rule.append(pp)
+ })
+ rule.append(ip)
+ get(suffixPattern).orElse(getDefault(suffixPattern)).foreach(sp => {
+ require(sp.endsWith("\\z"), "suffixPattern must end with \\z to ensure it is the end of the string")
+ require(sp.contains("(") && sp.contains(")"), "suffixPattern must contain regex groups. Each group will result in a separate token")
+ rule.append(sp)
+ })
+ rules.append(rule.toString)
+ })
+ rules.foldLeft(new RuleFactory(MatchStrategy.MATCH_FIRST))((factory, rule) => factory.addRule(rule.r, rule))
+ }
+
+ /** See the rubular.com links next to each default pattern for an explanation */
+ setDefault(infixPatterns, Array(
+ "((?:\\p{L}+\\.)+)", // http://rubular.com/r/cRBtGuLlF6
+ "(\\p{L}+)(n't\\b)", // http://rubular.com/r/coeYJFt8eM
+ "(\\p{L}+)('{1}\\p{L}+)", // http://rubular.com/r/N84PYwYjQp
+ "((?:\\p{L}+[^\\s\\p{L}]{1})+\\p{L}+)", // http://rubular.com/r/wOvQcey9e3
+ "(\\p{L}+)" // basic word token
+ ))
+ /** These catch everything before and after a word as separate tokens */
+ setDefault(prefixPattern, "\\A([^\\s\\p{L}$\\.]*)")
+ setDefault(suffixPattern, "([^\\s\\p{L}]?)([^\\s\\p{L}]*)\\z")
+ setDefault(targetPattern, "\\S+")
+
+ private val PROTECT_STR = "ↈ"
+
+ def tag(sentences: Seq[Sentence]): Seq[TokenizedSentence] = {
+ sentences.map{text =>
+ /** Step 1, protect exception words from being broken*/
+ var protected_text = text.content
+ if (get(compositeTokens).isDefined) {
+ $(compositeTokens).foreach(tokenException =>
+ protected_text = protected_text.replaceAll(
+ tokenException,
+ tokenException.replaceAll("[^(?:" + $(targetPattern) + ")]", PROTECT_STR)
+ )
+ )
+ }
+ /** Step 2, Return protected exception tokens back into text and move on*/
+ val tokens = $(targetPattern).r.findAllMatchIn(protected_text).flatMap { candidate =>
+ if (get(compositeTokens).isDefined && candidate.matched.contains(PROTECT_STR)) {
+ /** Put back character and move on */
+ Seq(IndexedToken(
+ text.content.slice(candidate.start, candidate.end),
+ text.start + candidate.start,
+ text.start + candidate.end - 1
+ ))
+ }
+ else {
+ /** Step 3, If no exception found, find candidates through the possible general rule patterns*/
+ ruleFactory.findMatchFirstOnly(candidate.matched).map {m =>
+ var curPos = m.content.start
+ (1 to m.content.groupCount)
+ .map (i => {
+ val target = m.content.group(i)
+ val it = IndexedToken(
+ target,
+ text.start + candidate.start + curPos,
+ text.start + candidate.start + curPos + target.length - 1
+ )
+ curPos += target.length
+ it
+ })
+ /** Step 4, If rules didn't match, return whatever candidate we have and leave it as is*/
+ }.getOrElse(Seq(IndexedToken(
+ candidate.matched,
+ text.start + candidate.start,
+ text.start + candidate.end - 1
+ )))
+ }}.toArray.filter(t => t.token.nonEmpty)
+ TokenizedSentence(tokens)
+ }
+ }
+
+ /** one to many annotation */
+ override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
+ val sentences = SentenceSplit.unpack(annotations)
+ val tokenized = tag(sentences)
+ Tokenized.pack(tokenized)
+ }
+}
+
+object Tokenizer extends DefaultParamsReadable[Tokenizer]
\ No newline at end of file
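
A short usage sketch of the new parameters (the composite token and the extra infix rule below are illustrative, not defaults):

    val tokenizer = new Tokenizer()
      .setInputCols(Array("sentence"))
      .setOutputCol("token")
      .setTargetPattern("\\S+")                  // candidates are whitespace-separated chunks (the default)
      .setCompositeTokens(Array("New York"))     // protected from being split
      .addInfixPattern("(\\p{L}+)(-)(\\p{L}+)")  // hypothetical extra rule: emit hyphenated words as three sub-tokens

The defaults already handle contractions, embedded punctuation and trailing symbols, so most pipelines only need setInputCols/setOutputCol.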
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
index 2df2d514070227..3beb73cb97e217 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
@@ -15,7 +15,7 @@ object DependencyParsed extends Annotated[DependencyParsedSentence]{
val sentences = Tokenized.unpack(annotations)
val depAnnotations = annotations
.filter(a => a.annotatorType == annotatorType)
- .sortBy(a => a.begin)
+ .sortBy(a => a.start)
var last = 0
sentences.map{sentence =>
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
index 0a2d16b5aaa7b9..badb8644ffb316 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
@@ -5,7 +5,7 @@ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}
/**
* structure representing a sentence and its boundaries
*/
-case class Sentence(content: String, begin: Int, end: Int)
+case class Sentence(content: String, start: Int, end: Int)
object Sentence {
def fromTexts(texts: String*): Seq[Sentence] = {
@@ -27,11 +27,11 @@ object SentenceSplit extends Annotated[Sentence] {
override def unpack(annotations: Seq[Annotation]): Seq[Sentence] = {
annotations.filter(_.annotatorType == annotatorType)
.map(annotation =>
- Sentence(annotation.result, annotation.begin, annotation.end)
+ Sentence(annotation.result, annotation.start, annotation.end)
)
}
override def pack(items: Seq[Sentence]): Seq[Annotation] = {
- items.map(item => Annotation(annotatorType, item.begin, item.end, item.content, Map.empty[String, String]))
+ items.map(item => Annotation(annotatorType, item.start, item.end, item.content, Map.empty[String, String]))
}
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
index b64c5fb33bfc81..629a8099197544 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
@@ -15,17 +15,17 @@ trait Tagged[T >: TaggedSentence <: TaggedSentence] extends Annotated[T] {
val tokenized = Tokenized.unpack(annotations)
val tagAnnotations = annotations
.filter(a => a.annotatorType == annotatorType)
- .sortBy(a => a.begin)
+ .sortBy(a => a.start)
.toIterator
var annotation: Option[Annotation] = None
tokenized.map { sentence =>
val tokens = sentence.indexedTokens.map { token =>
- while (tagAnnotations.hasNext && (annotation.isEmpty || annotation.get.begin < token.begin))
+ while (tagAnnotations.hasNext && (annotation.isEmpty || annotation.get.start < token.begin))
annotation = Some(tagAnnotations.next)
- val tag = if (annotation.isDefined && annotation.get.begin == token.begin)
+ val tag = if (annotation.isDefined && annotation.get.start == token.begin)
annotation.get.result
else
emptyTag
@@ -69,7 +69,7 @@ trait Tagged[T >: TaggedSentence <: TaggedSentence] extends Annotated[T] {
}
protected def getLabels(sentences: Seq[TaggedSentence], labelAnnotations: Seq[Annotation]): Seq[TextSentenceLabels] = {
- val position2Tag = labelAnnotations.map(a => (a.begin, a.end) -> a.result).toMap
+ val position2Tag = labelAnnotations.map(a => (a.start, a.end) -> a.result).toMap
sentences.map{sentence =>
val labels = sentence.indexedTaggedWords.map { w =>
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
index e02bec3e79e339..c1128c5be7c976 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
@@ -13,8 +13,8 @@ object Tokenized extends Annotated[TokenizedSentence] {
SentenceSplit.unpack(annotations).map(sentence => {
tokens.filter(token =>
- token.begin >= sentence.begin & token.end <= sentence.end
- ).map(token => IndexedToken(token.result, token.begin, token.end))
+ token.start >= sentence.start & token.end <= sentence.end
+ ).map(token => IndexedToken(token.result, token.start, token.end))
}).filter(_.nonEmpty).map(indexedTokens => TokenizedSentence(indexedTokens))
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala
index 795ccf0b4e76f3..be39a481839b68 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala
@@ -3,11 +3,11 @@ package com.johnsnowlabs.nlp.annotators.ner.crf
import com.johnsnowlabs.ml.crf.{CrfParams, LinearChainCrf, TextSentenceLabels, Verbose}
import com.johnsnowlabs.nlp.{AnnotatorType, DocumentAssembler, HasRecursiveFit, RecursivePipeline}
import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, NAMED_ENTITY, POS, TOKEN}
-import com.johnsnowlabs.nlp.annotators.RegexTokenizer
+import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.common.Annotated.PosTaggedSentence
import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach
-import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
+import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.datasets.CoNLL
import com.johnsnowlabs.nlp.embeddings.ApproachWithWordEmbeddings
import org.apache.spark.ml.{Pipeline, PipelineModel}
@@ -88,19 +88,19 @@ class NerCrfApproach(override val uid: String)
return recursivePipeline.get.transform(dataframe)
}
- logger.warn("NER CRF not in a RecursivePipeline." +
- "It is recommended to use a com.jonsnowlabs.nlp.RecursivePipeline for" +
+ logger.warn("NER CRF not in a RecursivePipeline. " +
+ "It is recommended to use a com.jonsnowlabs.nlp.RecursivePipeline for " +
"better performance during training")
val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setCustomBoundChars(Array("\n\n"))
.setInputCols(Array("document"))
.setOutputCol("sentence")
- val tokenizer = new RegexTokenizer()
+ val tokenizer = new Tokenizer()
.setInputCols(Array("document"))
.setOutputCol("token")
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproach.scala
similarity index 82%
rename from src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala
rename to src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproach.scala
index 29ef30f1fb03be..ca24eb703a29e1 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproach.scala
@@ -7,7 +7,7 @@ import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset
-class DependencyParser(override val uid: String) extends AnnotatorApproach[DependencyParserModel] {
+class DependencyParserApproach(override val uid: String) extends AnnotatorApproach[DependencyParserModel] {
override val description: String = "Dependency Parser Estimator used to train"
def this() = this(Identifiable.randomUID(DEPENDENCY))
@@ -26,4 +26,4 @@ class DependencyParser(override val uid: String) extends AnnotatorApproach[Depen
}
}
-object DependencyParser extends DefaultParamsReadable[DependencyParser]
\ No newline at end of file
+object DependencyParserApproach extends DefaultParamsReadable[DependencyParserApproach]
\ No newline at end of file
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
index ca0432788c411b..d78b133a18cd89 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
@@ -199,7 +199,7 @@ class PragmaticContentFormatter(text: String) {
val factory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
// http://rubular.com/r/G2opjedIm9
//special periods
- .addRule(new RegexRule("http://rubular.com/r/G2opjedIm9", "formatGeo"))
+ .addRule(new RegexRule("(?<=[a-zA-Z]°)\\.(?=\\s*\\d+)", "formatGeo"))
wip = factory.transformWithSymbol(MULT_PERIOD, wip)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetector.scala
similarity index 91%
rename from src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModel.scala
rename to src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetector.scala
index 0612c7d5dfa723..e115cc9d50e5a1 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetector.scala
@@ -10,7 +10,7 @@ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
* @param uid internal constructor requirement for serialization of params
* @@ model: Model to use for boundaries detection
*/
-class SentenceDetectorModel(override val uid: String) extends AnnotatorModel[SentenceDetectorModel] {
+class SentenceDetector(override val uid: String) extends AnnotatorModel[SentenceDetector] {
import com.johnsnowlabs.nlp.AnnotatorType._
@@ -56,4 +56,4 @@ class SentenceDetectorModel(override val uid: String) extends AnnotatorModel[Sen
}
}
-object SentenceDetectorModel extends DefaultParamsReadable[SentenceDetectorModel]
\ No newline at end of file
+object SentenceDetector extends DefaultParamsReadable[SentenceDetector]
\ No newline at end of file
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticScorer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticScorer.scala
index ef266d2ac2f27c..9a1e2add1dd9c1 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticScorer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticScorer.scala
@@ -103,6 +103,6 @@ object PragmaticScorer {
new PragmaticScorer(javaSentimentDict.asScala.toMap)
}
def fromPath(overridePath: String, sentFormat: String, sentSeparator: String) {
- new PragmaticScorer(SentimentDetectorModel.retrieveSentimentDict(overridePath, sentFormat.toUpperCase, sentSeparator))
+ new PragmaticScorer(SentimentDetector.retrieveSentimentDict(overridePath, sentFormat.toUpperCase, sentSeparator))
}
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetectorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala
similarity index 79%
rename from src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetectorModel.scala
rename to src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala
index 990d690fd519fb..0fa6362eb45650 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetectorModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala
@@ -16,7 +16,7 @@ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
* @param uid internal uid needed for saving annotator to disk
* @@ model: Implementation to be applied for sentiment analysis
*/
-class SentimentDetectorModel(override val uid: String) extends AnnotatorModel[SentimentDetectorModel] {
+class SentimentDetector(override val uid: String) extends AnnotatorModel[SentimentDetector] {
import com.johnsnowlabs.nlp.AnnotatorType._
@@ -36,7 +36,7 @@ class SentimentDetectorModel(override val uid: String) extends AnnotatorModel[Se
setDefault(dictSeparator, config.getString("nlp.sentimentDict.separator"))
lazy val model: PragmaticScorer =
- new PragmaticScorer(SentimentDetectorModel.retrieveSentimentDict($(dictPath), $(dictFormat), $(dictSeparator)))
+ new PragmaticScorer(SentimentDetector.retrieveSentimentDict($(dictPath), $(dictFormat), $(dictSeparator)))
override val annotatorType: AnnotatorType = SENTIMENT
@@ -44,29 +44,17 @@ class SentimentDetectorModel(override val uid: String) extends AnnotatorModel[Se
def this() = this(Identifiable.randomUID("SENTIMENT"))
- def setDictPath(path: String): this.type = {
- set(dictPath, path)
- }
+ def setDictPath(path: String): this.type = set(dictPath, path)
- def getDictPath: String = {
- $(dictPath)
- }
+ def getDictPath: String = $(dictPath)
- def setDictFormat(format: String): this.type = {
- set(dictFormat, format)
- }
+ def setDictFormat(format: String): this.type = set(dictFormat, format)
- def getDictFormat: String = {
- $(dictFormat)
- }
+ def getDictFormat: String = $(dictFormat)
- def setDictSeparator(separator: String): this.type = {
- set(dictSeparator, separator)
- }
+ def setDictSeparator(separator: String): this.type = set(dictSeparator, separator)
- def getDictSeparator: String = {
- $(dictSeparator)
- }
+ def getDictSeparator: String = $(dictSeparator)
/**
* Tokens are needed to identify each word in a sentence boundary
@@ -91,7 +79,7 @@ class SentimentDetectorModel(override val uid: String) extends AnnotatorModel[Se
}
}
-object SentimentDetectorModel extends DefaultParamsReadable[SentimentDetectorModel] {
+object SentimentDetector extends DefaultParamsReadable[SentimentDetector] {
/**
* Sentiment dictionaries from compiled sources set in configuration
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
index e12e0899eedb64..2f3814196d63e7 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
@@ -237,7 +237,7 @@ class NorvigSweetingModel(override val uid: String) extends AnnotatorModel[Norvi
annotations.map { token =>
Annotation(
annotatorType,
- token.begin,
+ token.start,
token.end,
check(token.result),
token.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
index f3a251f533423b..404e3a948bb86d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
@@ -2,23 +2,28 @@ package com.johnsnowlabs.nlp.serialization
import com.johnsnowlabs.nlp.HasFeatures
import com.johnsnowlabs.nlp.util.ConfigHelper
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import scala.reflect.ClassTag
-abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model: HasFeatures, val name: String)(implicit val sparkSession: SparkSession = SparkSession.builder().getOrCreate()) extends Serializable {
+abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model: HasFeatures, val name: String) extends Serializable {
model.features.append(this)
private val config = ConfigHelper.retrieve
+ private val spark = ResourceHelper.spark
val serializationMode: String = config.getString("performance.serialization")
val useBroadcast: Boolean = config.getBoolean("performance.useBroadcast")
final protected var broadcastValue: Option[Broadcast[TComplete]] = None
+
final protected var rawValue: Option[TComplete] = None
- final protected var fallback: Option[() => TComplete] = None
+ final protected var fallbackRawValue: Option[TComplete] = None
+
+ final protected var fallbackLazyValue: Option[() => TComplete] = None
final def serialize(spark: SparkSession, path: String, field: String, value: TComplete): Unit = {
serializationMode match {
@@ -52,26 +57,35 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
final protected def getFieldPath(path: String, field: String): Path =
Path.mergePaths(new Path(path), new Path("/fields/" + field))
+ private def callAndSetFallback: Option[TComplete] = {
+ fallbackRawValue = fallbackLazyValue.map(_())
+ fallbackRawValue
+ }
+
final def get: Option[TComplete] = {
broadcastValue.map(_.value).orElse(rawValue)
}
- final def getValue: TComplete = {
- broadcastValue.map(_.value).orElse(rawValue).orElse(fallback.map(_())).getOrElse(throw new Exception(s"feature $name is not set"))
+ final def getOrDefault: TComplete = {
+ broadcastValue.map(_.value)
+ .orElse(rawValue)
+ .orElse(fallbackRawValue)
+ .orElse(callAndSetFallback)
+ .getOrElse(throw new Exception(s"feature $name is not set"))
}
- final def setValue(v: Option[Any]): HasFeatures = {
+ final def setValue(value: Option[Any]): HasFeatures = {
if (useBroadcast) {
if (isSet) broadcastValue.get.destroy()
- broadcastValue = Some(sparkSession.sparkContext.broadcast[TComplete](v.get.asInstanceOf[TComplete]))
+ broadcastValue = value.map(v => spark.sparkContext.broadcast[TComplete](v.asInstanceOf[TComplete]))
} else {
- rawValue = Some(v.get.asInstanceOf[TComplete])
+ rawValue = Some(value.get.asInstanceOf[TComplete])
}
model
}
def setFallback(v: Option[() => TComplete]): HasFeatures = {
- fallback = v
+ fallbackLazyValue = v
model
}
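
getOrDefault (formerly getValue) now also caches the computed fallback. A stand-alone sketch of the resolution order, not the library class itself:

    // broadcast value -> raw value -> cached fallback -> freshly computed fallback -> error
    def resolve[T](broadcast: Option[T], raw: Option[T],
                   cachedFallback: Option[T], fallback: Option[() => T]): T =
      broadcast
        .orElse(raw)
        .orElse(cachedFallback)
        .orElse(fallback.map(_()))
        .getOrElse(throw new Exception("feature is not set"))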
diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
index 740b19881a9398..26eb40187b135a 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
@@ -2,7 +2,7 @@ package com.johnsnowlabs.nlp.util.io
import java.io.{File, FileNotFoundException, InputStream}
-import com.johnsnowlabs.nlp.annotators.{Normalizer, RegexTokenizer}
+import com.johnsnowlabs.nlp.annotators.{Normalizer, Tokenizer}
import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher}
import com.johnsnowlabs.nlp.util.io.ResourceFormat._
import org.apache.spark.ml.Pipeline
@@ -367,10 +367,10 @@ object ResourceHelper {
val wordCount = MMap.empty[String, Int].withDefaultValue(0)
val documentAssembler = new DocumentAssembler()
.setInputCol("value")
- val tokenizer = new RegexTokenizer()
+ val tokenizer = new Tokenizer()
.setInputCols("document")
.setOutputCol("token")
- .setPattern(tokenPattern)
+ .setTargetPattern(tokenPattern)
val normalizer = new Normalizer()
.setInputCols("token")
.setOutputCol("normal")
diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RegexRule.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RegexRule.scala
index df81da027773fd..cb3428feafbb55 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RegexRule.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RegexRule.scala
@@ -7,7 +7,7 @@ import scala.util.matching.Regex
* @param rx a java.matching.Regex object
* @param identifier some description that might help link the regex to its meaning
*/
-class RegexRule(rx: Regex, val identifier: String) {
+class RegexRule(rx: Regex, val identifier: String) extends Serializable {
def this(rx: String, identifier: String) {
this(rx.r, identifier)
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
index 38ea929b48ebdb..70c2ac5cafbcda 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
@@ -16,7 +16,7 @@ import scala.util.matching.Regex
*/
class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
transformStrategy: TransformStrategy.TransformStrategy = TransformStrategy.NO_TRANSFORM)
- extends RuleSymbols {
+ extends RuleSymbols with Serializable {
/**
* Internal representation of a regex match
@@ -28,16 +28,15 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
import TransformStrategy._
import MatchStrategy._
- val logger = LoggerFactory.getLogger("RuleFactory")
+ /** Helper functions to identify context in a word for debugging */
+ private val logger = LoggerFactory.getLogger("RuleFactory")
+ private def logSubStartHelper(start: Int): Int = if (start > 10) start - 10 else 0
+ private def logSubEndHelper(sourceLength: Int, end: Int): Int = if (sourceLength - end > 10) end + 10 else sourceLength
/** Rules and SymbolRules are key pieces of regex transformation */
private var rules: Seq[RegexRule] = Seq()
private var symbolRules: Seq[(String, RegexRule)] = Seq()
- /** Helper functions to identify context in a word for debugging */
- private def logSubStartHelper(start: Int): Int = if (start > 10) start - 10 else 0
- private def logSubEndHelper(sourceLength: Int, end: Int): Int = if (sourceLength - end > 10) end + 10 else sourceLength
-
/** Adds a rule to this factory*/
def addRule(rule: RegexRule): this.type = {
rules = rules :+ rule
@@ -50,6 +49,91 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
this
}
+ def clearRules(): this.type = {
+ rules = Seq.empty[RegexRule]
+ this
+ }
+
+ /** Shortcut functions resolved once at construction, since a strategy won't change during the lifetime of the Factory */
+ private val findMatchFunc = (text: String) => matchStrategy match {
+ case MATCH_ALL => rules.flatMap(rule => rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
+ case MATCH_FIRST => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
+ case MATCH_COMPLETE => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).filter(_.matched == text).map(m => RuleMatch(m, rule.identifier)))
+ }
+
+ private val transformMatchFunc = (text: String, regex: Regex, transform: Regex.Match => String) => matchStrategy match {
+ case MATCH_ALL => regex.replaceAllIn(text, transform)
+ case MATCH_FIRST => regex.findFirstMatchIn(text).map(m => regex.replaceFirstIn(text, transform(m))).getOrElse(text)
+ case MATCH_COMPLETE => regex.findFirstMatchIn(text).filter(_.matched == text).map(m =>
+ regex.replaceFirstIn(text, transform(m))).getOrElse(text)
+ case _ => throw new IllegalArgumentException("Invalid match strategy")
+ }
+
+ private val transformWithSymbolFunc = (symbol: String, text: String) => transformStrategy match {
+ case APPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
+ logger.debug("Matched: {} from: {} using rule {} with strategy {}",
+ () => m.matched,
+ () => m.source.subSequence(logSubStartHelper(m.start),logSubEndHelper(m.source.length, m.end)),
+ () => rule.identifier,
+ () => APPEND_WITH_SYMBOL)
+ "$0" + symbol
+ }))
+ case PREPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
+ logger.debug("Matched: {} from: {} using rule {} with strategy {}",
+ () => m.matched,
+ () => m.source.subSequence(logSubStartHelper(m.start),logSubEndHelper(m.source.length, m.end)),
+ () => rule.identifier,
+ () => PREPEND_WITH_SYMBOL)
+ symbol + "$0"
+ }))
+ case REPLACE_ALL_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
+ logger.debug("Matched: {} from: {} using rule {} with strategy {}",
+ () => m.matched,
+ () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
+ () => rule.identifier,
+ () => REPLACE_ALL_WITH_SYMBOL)
+ symbol
+ }))
+ case REPLACE_WITH_SYMBOL_AND_BREAK => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
+ logger.debug("Matched: {} from: {} using rule {} with strategy {}",
+ () => m.matched,
+ () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
+ () => rule.identifier,
+ () => REPLACE_WITH_SYMBOL_AND_BREAK)
+ symbol + BREAK_INDICATOR
+ }))
+ case _ => throw new IllegalArgumentException("Invalid strategy for rule factory")
+ }
+
+ private val transformWithSymbolicRulesFunc = (text: String) => transformStrategy match {
+ case REPLACE_EACH_WITH_SYMBOL => symbolRules.foldRight(text)((rule, target) => transformMatch(target, rule._2.regex)({ m =>
+ logger.debug("Matched: {} from: {} using rule {} with strategy {}",
+ () => m.matched,
+ () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
+ () => rule._2.identifier,
+ () => REPLACE_EACH_WITH_SYMBOL)
+ rule._1
+ }))
+ case REPLACE_EACH_WITH_SYMBOL_AND_BREAK => symbolRules.foldRight(text)((rule, target) => rule._2.regex replaceAllIn(
+ target, m => {
+ logger.debug("Matched: {} from: {} using rule {} with strategy {}",
+ () => m.matched,
+ () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
+ () => rule._2.identifier,
+ () => REPLACE_EACH_WITH_SYMBOL_AND_BREAK)
+ rule._1 + BREAK_INDICATOR
+ }))
+ case PROTECT_FROM_BREAK => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
+ logger.debug("Matched: {} from: {} using rule {} with strategy {}",
+ () => m.matched,
+ () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
+ () => rule.identifier,
+ () => PROTECT_FROM_BREAK)
+ PROTECTION_MARKER_OPEN + m.matched + PROTECTION_MARKER_CLOSE
+ }))
+ case _ => throw new IllegalArgumentException("Invalid strategy for rule factory")
+ }
+
/**
* Adds a rule and its associated symbol to apply some transformation using such symbol
* @param symbol symbol is a character to be used in a transformation application, where many rules can apply different transformations
@@ -75,11 +159,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
/**Applies factory match strategy to find matches and returns any number of Matches*/
def findMatch(text: String): Seq[RuleMatch] = {
- matchStrategy match {
- case MATCH_ALL => rules.flatMap(rule => rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
- case MATCH_FIRST => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
- case MATCH_COMPLETE => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).filter(_.matched == text).map(m => RuleMatch(m, rule.identifier)))
- }
+ findMatchFunc(text)
}
/** Specifically finds a first match within a group of matches */
@@ -93,13 +173,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
* @return Resulting transformation
*/
private def transformMatch(text: String, regex: Regex)(transform: Regex.Match => String): String = {
- matchStrategy match {
- case MATCH_ALL => regex.replaceAllIn(text, transform)
- case MATCH_FIRST => regex.findFirstMatchIn(text).map(m => regex.replaceFirstIn(text, transform(m))).getOrElse(text)
- case MATCH_COMPLETE => regex.findFirstMatchIn(text).filter(_.matched == text).map(m =>
- regex.replaceFirstIn(text, transform(m))).getOrElse(text)
- case _ => throw new IllegalArgumentException("Invalid match strategy")
- }
+ transformMatchFunc(text, regex, transform)
}
/**
@@ -109,41 +183,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
* @return
*/
def transformWithSymbol(symbol: String, text: String): String = {
- transformStrategy match {
- case APPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
- logger.debug("Matched: {} from: {} using rule {} with strategy {}",
- () => m.matched,
- () => m.source.subSequence(logSubStartHelper(m.start),logSubEndHelper(m.source.length, m.end)),
- () => rule.identifier,
- () => APPEND_WITH_SYMBOL)
- "$0" + symbol
- }))
- case PREPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
- logger.debug("Matched: {} from: {} using rule {} with strategy {}",
- () => m.matched,
- () => m.source.subSequence(logSubStartHelper(m.start),logSubEndHelper(m.source.length, m.end)),
- () => rule.identifier,
- () => PREPEND_WITH_SYMBOL)
- symbol + "$0"
- }))
- case REPLACE_ALL_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
- logger.debug("Matched: {} from: {} using rule {} with strategy {}",
- () => m.matched,
- () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
- () => rule.identifier,
- () => REPLACE_ALL_WITH_SYMBOL)
- symbol
- }))
- case REPLACE_WITH_SYMBOL_AND_BREAK => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
- logger.debug("Matched: {} from: {} using rule {} with strategy {}",
- () => m.matched,
- () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
- () => rule.identifier,
- () => REPLACE_WITH_SYMBOL_AND_BREAK)
- symbol + BREAK_INDICATOR
- }))
- case _ => throw new IllegalArgumentException("Invalid strategy for rule factory")
- }
+ transformWithSymbolFunc(symbol, text)
}
/**
@@ -152,34 +192,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
* @return Returns a transformed text
*/
def transformWithSymbolicRules(text: String): String = {
- transformStrategy match {
- case REPLACE_EACH_WITH_SYMBOL => symbolRules.foldRight(text)((rule, target) => transformMatch(target, rule._2.regex)({ m =>
- logger.debug("Matched: {} from: {} using rule {} with strategy {}",
- () => m.matched,
- () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
- () => rule._2.identifier,
- () => REPLACE_EACH_WITH_SYMBOL)
- rule._1
- }))
- case REPLACE_EACH_WITH_SYMBOL_AND_BREAK => symbolRules.foldRight(text)((rule, target) => rule._2.regex replaceAllIn(
- target, m => {
- logger.debug("Matched: {} from: {} using rule {} with strategy {}",
- () => m.matched,
- () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
- () => rule._2.identifier,
- () => REPLACE_EACH_WITH_SYMBOL_AND_BREAK)
- rule._1 + BREAK_INDICATOR
- }))
- case PROTECT_FROM_BREAK => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
- logger.debug("Matched: {} from: {} using rule {} with strategy {}",
- () => m.matched,
- () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
- () => rule.identifier,
- () => PROTECT_FROM_BREAK)
- PROTECTION_MARKER_OPEN + m.matched + PROTECTION_MARKER_CLOSE
- }))
- case _ => throw new IllegalArgumentException("Invalid strategy for rule factory")
- }
+ transformWithSymbolicRulesFunc(text)
}
}
object RuleFactory {
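Note on the refactor above: findMatch, transformMatch, transformWithSymbol and transformWithSymbolicRules keep their signatures and behaviour; they now simply delegate to the private function values introduced at the top of the class. A minimal sketch of driving the factory follows, assuming a two-argument constructor and an addRule(regex, identifier) overload returning the factory itself (neither appears in this hunk, so treat them, and the package path, as assumptions):

    import scala.util.matching.Regex
    // assumed package path for RuleFactory and the strategy enums
    import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory, TransformStrategy}

    // assumed constructor and addRule signature; findMatch and transformWithSymbol
    // are the public methods shown verbatim in the diff above
    val factory = new RuleFactory(MatchStrategy.MATCH_ALL, TransformStrategy.REPLACE_ALL_WITH_SYMBOL)
      .addRule(new Regex("\\d+"), "digits")

    // MATCH_ALL: one RuleMatch per regex hit, across all registered rules
    val matches = factory.findMatch("call 555 or 1234")

    // REPLACE_ALL_WITH_SYMBOL: every hit is replaced by the given symbol
    val masked = factory.transformWithSymbol("#", "call 555 or 1234")
    // masked == "call # or #"
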
diff --git a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala
index 5d2c5a2fe0a561..d103c1f89b122f 100644
--- a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala
+++ b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala
@@ -1,12 +1,12 @@
package com.johnsnowlabs.ml.crf
import com.johnsnowlabs.nlp._
-import com.johnsnowlabs.nlp.annotators.RegexTokenizer
+import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.common.Annotated.{NerTaggedSentence, PosTaggedSentence}
import com.johnsnowlabs.nlp.annotators.common.{NerTagged, PosTagged, TaggedSentence}
import com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach
import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach
-import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
+import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.datasets.CoNLL
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat
import org.apache.spark.ml.{PipelineModel, PipelineStage}
@@ -30,12 +30,12 @@ object CoNLL2003PipelineTest extends App {
.setInputCol("text")
.setOutputCol("document")
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setCustomBoundChars(Array("\n\n"))
.setInputCols(Array("document"))
.setOutputCol("sentence")
- val tokenizer = new RegexTokenizer()
+ val tokenizer = new Tokenizer()
.setInputCols(Array("document"))
.setOutputCol("token")
diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
index 30fc4466f57010..03167277af7ba1 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
@@ -103,13 +103,13 @@ class AnnotatorBaseTestSpec extends FlatSpec {
val contentMeta = result.select("demand", "result").take(1).head.getSeq[Row](0)
val contentAnnotation = contentMeta.map(Annotation(_)).head
assert(contentAnnotation.annotatorType == dummyAnnotator.annotatorType)
- assert(contentAnnotation.begin == 0)
+ assert(contentAnnotation.start == 0)
assert(contentAnnotation.end == 25)
assert(contentAnnotation.metadata.contains("a") && contentAnnotation.metadata("a") == "b")
val demandContentMeta = result.select("demand", "result").take(1).head.getSeq[Row](1)
val demandContentAnnotation = demandContentMeta.map(Annotation(_)).head
assert(demandContentAnnotation.annotatorType == demandingDummyAnnotator.annotatorType)
- assert(demandContentAnnotation.begin == 11)
+ assert(demandContentAnnotation.start == 11)
assert(demandContentAnnotation.end == 18)
assert(demandContentAnnotation.metadata.contains("aa") && demandContentAnnotation.metadata("aa") == "bb")
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
index 0e67f45913aa83..6a7ce1197a81bf 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
@@ -2,10 +2,10 @@ package com.johnsnowlabs.nlp
import com.johnsnowlabs.nlp.annotators._
import com.johnsnowlabs.nlp.annotators.ner.crf.{NerCrfApproach, NerCrfModel}
-import com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParser
+import com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserApproach
import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach
-import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
-import com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel
+import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
+import com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector
import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach
import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat
@@ -26,7 +26,7 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
}
def withTokenizer(dataset: Dataset[Row]): Dataset[Row] = {
- val regexTokenizer = new RegexTokenizer()
+ val regexTokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
regexTokenizer.transform(withFullPragmaticSentenceDetector(dataset))
@@ -75,7 +75,7 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
}
def withFullPragmaticSentenceDetector(dataset: Dataset[Row]): Dataset[Row] = {
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")
sentenceDetector.transform(dataset)
@@ -109,7 +109,7 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
}
def withPragmaticSentimentDetector(dataset: Dataset[Row]): Dataset[Row] = {
- val sentimentDetector = new SentimentDetectorModel
+ val sentimentDetector = new SentimentDetector
sentimentDetector
.setInputCols(Array("token", "sentence"))
.setOutputCol("sentiment")
@@ -139,7 +139,7 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
def withDependencyParser(dataset: Dataset[Row]): Dataset[Row] = {
val df = withFullPOSTagger(withTokenizer(dataset))
- new DependencyParser()
+ new DependencyParserApproach()
.setInputCols(Array("sentence", "pos", "token"))
.setOutputCol("dependency")
.setSourcePath("src/test/resources/models/dep-model.txt")
diff --git a/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
index 448fae3b41de92..326fed4adc2ae1 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
@@ -18,7 +18,7 @@ class DocumentAssemblerTestSpec extends FlatSpec {
"A DocumentAssembler" should "annotate with the correct indexes" in {
val f = fixture
- f.text.head should equal (f.text(f.assembledDoc.head.begin))
+ f.text.head should equal (f.text(f.assembledDoc.head.start))
f.text.last should equal (f.text(f.assembledDoc.head.end))
}
}
\ No newline at end of file
diff --git a/src/test/scala/com/johnsnowlabs/nlp/FinisherTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/FinisherTestSpec.scala
index 0e09cd6313f09c..99c8f306a69f45 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/FinisherTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/FinisherTestSpec.scala
@@ -1,6 +1,6 @@
package com.johnsnowlabs.nlp
-import com.johnsnowlabs.nlp.annotators.RegexTokenizer
+import com.johnsnowlabs.nlp.annotators.Tokenizer
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.StopWordsRemover
import org.scalatest._
@@ -14,7 +14,7 @@ class FinisherTestSpec extends FlatSpec {
.setInputCol("text")
.setOutputCol("document")
- val tokenizer = new RegexTokenizer()
+ val tokenizer = new Tokenizer()
.setInputCols(Array("document"))
.setOutputCol("token")
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/LemmatizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/LemmatizerTestSpec.scala
index 0ed89436902aa4..92ab5cf6aca237 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/LemmatizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/LemmatizerTestSpec.scala
@@ -1,6 +1,6 @@
package com.johnsnowlabs.nlp.annotators
-import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
+import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.types.ArrayType
@@ -43,11 +43,11 @@ class LemmatizerTestSpec extends FlatSpec with LemmatizerBehaviors {
.setInputCol("text")
.setOutputCol("document")
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")
- val tokenizer = new RegexTokenizer()
+ val tokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
index 0572a63009172e..57f7ec8d50024f 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
@@ -27,17 +27,16 @@ trait NormalizerBehaviors { this: FlatSpec =>
AnnotatorBuilder.withCaseSensitiveNormalizer(dataset)
.collect().foreach {
row =>
- val tokens = row.getSeq[Row](3).map(Annotation(_))
+ val tokens = row.getSeq[Row](3).map(Annotation(_)).filterNot(a => a.result == "." || a.result == ",")
val normalizedAnnotations = row.getSeq[Row](4).map(Annotation(_))
normalizedAnnotations.foreach {
- case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
- assert(stem.result.nonEmpty, "Annotation result exists")
+ case nToken: Annotation if nToken.annotatorType == AnnotatorType.TOKEN =>
+ assert(nToken.result.nonEmpty, "Annotation result exists")
case _ =>
}
-
normalizedAnnotations.zip(tokens).foreach {
- case (stem: Annotation, token: Annotation) =>
- assert(stem.result == token.result.replaceAll("[^a-zA-Z]", ""))
+ case (nToken: Annotation, token: Annotation) =>
+ assert(nToken.result == token.result.replaceAll("[^a-zA-Z]", ""))
}
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerTestSpec.scala
deleted file mode 100644
index 39fc39e98cd48f..00000000000000
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerTestSpec.scala
+++ /dev/null
@@ -1,41 +0,0 @@
-package com.johnsnowlabs.nlp.annotators
-
-import com.johnsnowlabs.nlp._
-import org.apache.spark.sql.{Dataset, Row}
-import org.scalatest._
-import java.util.Date
-
-/**
- * Created by saif on 02/05/17.
- */
-class RegexTokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors {
-
- val regexTokenizer = new RegexTokenizer
-
- "a RegexTokenizer" should s"be of type ${AnnotatorType.TOKEN}" in {
- assert(regexTokenizer.annotatorType == AnnotatorType.TOKEN)
- }
-
- "a spark based tokenizer" should "resolve big data" in {
- val data = ContentProvider.parquetData.limit(500000)
- .repartition(16)
-
- val documentAssembler = new DocumentAssembler()
- .setInputCol("text")
-
- val assembled = documentAssembler.transform(data)
-
- val tokenizer = new RegexTokenizer()
- .setOutputCol("token")
- val tokenized = tokenizer.transform(assembled)
-
- val date1 = new Date().getTime
- Annotation.take(tokenized, "token", 5000)
- info(s"Collected 5000 tokens took ${(new Date().getTime - date1) / 1000} seconds")
- }
-
- val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)
-
- "A full RegexTokenizer pipeline with latin content" should behave like fullTokenizerPipeline(latinBodyData)
-
-}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
similarity index 75%
rename from src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala
rename to src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
index 2aa10425d7d7d2..415bf09e92e371 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
@@ -1,16 +1,16 @@
package com.johnsnowlabs.nlp.annotators
-import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
+import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.{Annotation, AnnotatorBuilder, AnnotatorType}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest._
import scala.language.reflectiveCalls
-trait RegexTokenizerBehaviors { this: FlatSpec =>
+trait TokenizerBehaviors { this: FlatSpec =>
def fixture(dataset: => Dataset[Row]) = new {
- val df = AnnotatorBuilder.withTokenizer(AnnotatorBuilder.withFullPragmaticSentenceDetector(dataset))
+ val df = AnnotatorBuilder.withTokenizer(AnnotatorBuilder.withTokenizer(dataset))
val documents = df.select("document")
val sentences = df.select("sentence")
val tokens = df.select("token")
@@ -34,16 +34,16 @@ trait RegexTokenizerBehaviors { this: FlatSpec =>
}
def fullTokenizerPipeline(dataset: => Dataset[Row]) {
- "A RegexTokenizer Annotator" should "successfully transform data" in {
+ "A Tokenizer Annotator" should "successfully transform data" in {
val f = fixture(dataset)
- assert(f.tokensAnnotations.nonEmpty, "RegexTokenizer should add annotators")
+ assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators")
}
it should "annotate using the annotatorType of token" in {
val f = fixture(dataset)
- assert(f.tokensAnnotations.nonEmpty, "RegexTokenizer should add annotators")
+ assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators")
f.tokensAnnotations.foreach { a =>
- assert(a.annotatorType == AnnotatorType.TOKEN, "RegexTokenizer annotations type should be equal to 'token'")
+ assert(a.annotatorType == AnnotatorType.TOKEN, "Tokenizer annotations type should be equal to 'token'")
}
}
@@ -51,8 +51,8 @@ trait RegexTokenizerBehaviors { this: FlatSpec =>
val f = fixture(dataset)
f.tokensAnnotations.foreach { a =>
val token = a.result
- val sentenceToken = f.corpus.slice(a.begin, a.end + 1)
- assert(sentenceToken == token, s"Word ($sentenceToken) from sentence at (${a.begin},${a.end}) should be equal to token ($token) inside the corpus ${f.corpus}")
+ val sentenceToken = f.corpus.slice(a.start, a.end + 1)
+ assert(sentenceToken == token, s"Word ($sentenceToken) from sentence at (${a.start},${a.end}) should be equal to token ($token) inside the corpus ${f.corpus}")
}
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
new file mode 100644
index 00000000000000..481abd7c9baa8a
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -0,0 +1,93 @@
+package com.johnsnowlabs.nlp.annotators
+
+import com.johnsnowlabs.nlp._
+import org.apache.spark.sql.{Dataset, Row}
+import org.scalatest._
+import java.util.Date
+
+import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
+import org.apache.spark.ml.Pipeline
+
+/**
+ * Created by saif on 02/05/17.
+ */
+class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
+
+ import SparkAccessor.spark.implicits._
+
+ val regexTokenizer = new Tokenizer
+
+ "a Tokenizer" should s"be of type ${AnnotatorType.TOKEN}" in {
+ assert(regexTokenizer.annotatorType == AnnotatorType.TOKEN)
+ }
+
+
+ val targetText = "Hello, I won't be from New York in the U.S.A. (and you know it héroe). Give me my horse! or $100 bucks 'He said', I'll defeat markus-crassus."
+ val expected = Array(
+ "Hello", ",", "I", "wo", "n't", "be", "from", "New York", "in", "the", "U.S.A.", "(", "and", "you", "know", "it", "héroe", ")", ".",
+ "Give", "me", "my", "horse", "!", "or", "$100", "bucks", "'", "He", "said", "'", ",", "I", "'ll", "defeat", "markus-crassus", "."
+ )
+
+ "a Tokenizer" should "correctly tokenize target text on its defaults parameters with exceptions" in {
+ val data = DataBuilder.basicDataBuild(targetText)
+ val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
+ val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token").setCompositeTokens(Array("New York"))
+ val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setCleanAnnotations(false).setOutputCols("output")
+ val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))
+ val pip = pipeline.fit(data).transform(data)
+ val result = pip
+ .select("output").as[Array[String]]
+ .collect.flatten
+ assert(
+ result.sameElements(expected),
+ s"because result tokens differ: " +
+ s"\nresult was \n${result.mkString("|")} \nexpected is: \n${expected.mkString("|")}"
+ )
+ pip
+ .select("token").as[Array[Annotation]]
+ .collect.foreach(annotations => {
+ annotations.foreach(annotation => {
+ assert(targetText.slice(annotation.start, annotation.end + 1) == annotation.result)
+ })
+ })
+ }
+
+ "a Tokenizer" should "correctly tokenize target sentences on its defaults parameters with exceptions" in {
+ val data = DataBuilder.basicDataBuild(targetText)
+ val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
+ val sentence = new SentenceDetector().setInputCols("document").setOutputCol("sentence")
+ val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token").setCompositeTokens(Array("New York"))
+ val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
+ val pipeline = new Pipeline().setStages(Array(document, sentence, tokenizer, finisher))
+ val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
+ .collect.flatten
+ assert(
+ result.sameElements(expected),
+ s"because result tokens differ: " +
+ s"\nresult was \n${result.mkString("|")} \nexpected is: \n${expected.mkString("|")}"
+ )
+ }
+
+ "a spark based tokenizer" should "resolve big data" in {
+ val data = ContentProvider.parquetData.limit(500000)
+ .repartition(16)
+
+ val documentAssembler = new DocumentAssembler()
+ .setInputCol("text")
+
+ val assembled = documentAssembler.transform(data)
+
+ val tokenizer = new Tokenizer()
+ .setOutputCol("token")
+ val tokenized = tokenizer.transform(assembled)
+
+ val date1 = new Date().getTime
+ Annotation.take(tokenized, "token", 5000)
+ info(s"Collected 5000 tokens took ${(new Date().getTime - date1) / 1000} seconds")
+ }
+
+ val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)
+
+ "A full Tokenizer pipeline with latin content" should behave like fullTokenizerPipeline(latinBodyData)
+
+}
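For reference outside the test harness, here is a minimal standalone sketch of the renamed Tokenizer with a composite token, mirroring the defaults-with-exceptions case in the new spec above (it assumes a live SparkSession named spark; the column names are illustrative):

    import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher}
    import com.johnsnowlabs.nlp.annotators.Tokenizer
    import org.apache.spark.ml.Pipeline

    val data = spark.createDataFrame(Seq(Tuple1("I won't be from New York"))).toDF("text")

    val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
    // "New York" is protected as a single token, as in the test above
    val tokenizer = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")
      .setCompositeTokens(Array("New York"))
    val finisher = new Finisher().setInputCols("token").setOutputAsArray(true)

    val tokens = new Pipeline()
      .setStages(Array(document, tokenizer, finisher))
      .fit(data)
      .transform(data)
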
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
index 327f84e7d80738..b72dacc0045642 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
@@ -16,14 +16,14 @@ class NerCrfApproachSpec extends FlatSpec {
nerModel.write.overwrite.save("./test_crf_pipeline")
val loadedNer = NerCrfModel.read.load("./test_crf_pipeline")
- assert(nerModel.model.getValue.serialize == loadedNer.model.getValue.serialize)
- assert(nerModel.dictionaryFeatures.getValue == loadedNer.dictionaryFeatures.getValue)
+ assert(nerModel.model.getOrDefault.serialize == loadedNer.model.getOrDefault.serialize)
+ assert(nerModel.dictionaryFeatures.getOrDefault == loadedNer.dictionaryFeatures.getOrDefault)
}
"NerCrfApproach" should "have correct set of labels" in {
assert(nerModel.model.isSet)
- val metadata = nerModel.model.getValue.metadata
+ val metadata = nerModel.model.getOrDefault.metadata
assert(metadata.labels.toSeq == Seq("@#Start", "PER", "O", "ORG", "LOC"))
}
@@ -35,7 +35,7 @@ class NerCrfApproachSpec extends FlatSpec {
assert(annotations.length == labels.length)
for ((annotation, label) <- annotations.zip(labels)) {
- assert(annotation.begin == label.begin)
+ assert(annotation.start == label.start)
assert(annotation.end == label.end)
assert(annotation.annotatorType == AnnotatorType.NAMED_ENTITY)
assert(annotation.result == label.result)
@@ -65,7 +65,7 @@ class NerCrfApproachSpec extends FlatSpec {
"NerCrfModel" should "correctly handle entities param" in {
val restrictedModel = new NerCrfModel()
.setEntities(Array("PER", "LOC"))
- .setModel(nerModel.model.getValue)
+ .setModel(nerModel.model.getOrDefault)
.setOutputCol(nerModel.getOutputCol)
.setInputCols(nerModel.getInputCols)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachModelTest.scala
similarity index 59%
rename from src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModelTest.scala
rename to src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachModelTest.scala
index e73d3b1cfaf313..879f09b16c9d36 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModelTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachModelTest.scala
@@ -2,6 +2,6 @@ package com.johnsnowlabs.nlp.annotators.parser.dep
import org.scalatest.FlatSpec
-class DependencyParserModelTest extends FlatSpec {
+class DependencyParserApproachModelTest extends FlatSpec {
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
similarity index 90%
rename from src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserTest.scala
rename to src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
index 2918a42e37254c..e09a7dddcb2bea 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
@@ -5,7 +5,7 @@ import org.apache.spark.sql.Row
import org.scalatest.FlatSpec
import scala.language.reflectiveCalls
-class DependencyParserTest extends FlatSpec {
+class DependencyParserApproachTest extends FlatSpec {
def fixture = new {
val df = AnnotatorBuilder.withDependencyParser(DataBuilder.basicDataBuild(ContentProvider.depSentence))
val dependencies = df.select("dependency")
@@ -52,6 +52,6 @@ class DependencyParserTest extends FlatSpec {
val f = fixture
f.depAnnotations
.zip(f.tokenAnnotations)
- .foreach { case (dep, token) => assert(dep.begin == token.begin && dep.end == token.end, s"Token and word should have equal indixes") }
+ .foreach { case (dep, token) => assert(dep.start == token.start && dep.end == token.end, s"Token and word should have equal indices") }
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
index 7a92c52cca9ede..306c1de43d0743 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
@@ -14,15 +14,15 @@ class GreedyTransitionApproachTest extends FlatSpec {
val tokenAnnotations = Annotation.collect(df, "token")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
val posTagAnnotations = Annotation.collect(df, "pos")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
val sentenceAnnotation = Annotation.collect(df, "sentence")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
index 230d883894a6fc..80beba5221be73 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
@@ -1,6 +1,6 @@
package com.johnsnowlabs.nlp.annotators.pos.perceptron
-import com.johnsnowlabs.nlp.annotators.RegexTokenizer
+import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.common.Sentence
import com.johnsnowlabs.nlp.{ContentProvider, DataBuilder}
import org.scalatest._
@@ -30,8 +30,7 @@ class PerceptronApproachTestSpec extends FlatSpec with PerceptronApproachBehavio
length += text.length + 1
sentence
}
-
- new RegexTokenizer().tag(sentences).toArray
+ new Tokenizer().tag(sentences).toArray
}
"an isolated perceptron tagger" should behave like isolatedPerceptronTagging(
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticApproachTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticApproachTestSpec.scala
index b119c6cd033982..977fd8076db05c 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticApproachTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticApproachTestSpec.scala
@@ -1,6 +1,6 @@
package com.johnsnowlabs.nlp.annotators.sbd.pragmatic
-import com.johnsnowlabs.nlp.annotators.RegexTokenizer
+import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp._
import org.apache.spark.storage.StorageLevel
import org.scalatest._
@@ -29,14 +29,14 @@ class PragmaticApproachBigTestSpec extends FlatSpec {
val documentAssembler = new DocumentAssembler()
.setInputCol("text")
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setOutputCol("my_sbd_sentences")
val assembled = documentAssembler.transform(mergedSentences)
val sentenced = sentenceDetector.transform(assembled)
- val tokenizedFromDisk = new RegexTokenizer()
+ val tokenizedFromDisk = new Tokenizer()
.setInputCols(Array("my_sbd_sentences"))
.setOutputCol("token")
@@ -97,11 +97,11 @@ class PragmaticApproachTestSpec extends FlatSpec with PragmaticDetectionBehavior
)
"A Pragmatic SBD" should "be readable and writable" taggedAs Tag("LinuxOnly") in {
- val pragmaticDetector = new SentenceDetectorModel()
+ val pragmaticDetector = new SentenceDetector()
val path = "./test-output-tmp/pragmaticdetector"
try {
pragmaticDetector.write.overwrite.save(path)
- val pragmaticDetectorRead = SentenceDetectorModel.read.load(path)
+ val pragmaticDetectorRead = SentenceDetector.read.load(path)
} catch {
case _: java.io.IOException => succeed
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModelBoundsSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
similarity index 76%
rename from src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModelBoundsSpec.scala
rename to src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
index 9683fc7518e7df..b1f053161ab7a3 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModelBoundsSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
@@ -4,11 +4,11 @@ import com.johnsnowlabs.nlp.annotators.common.Sentence
import org.scalatest.FlatSpec
-class SentenceDetectorModelBoundsSpec extends FlatSpec {
+class SentenceDetectorBoundsSpec extends FlatSpec {
val model = new PragmaticMethod(false)
- "SentenceDetectorModel" should "return correct sentence bounds" in {
+ "SentenceDetector" should "return correct sentence bounds" in {
val bounds = model.extractBounds("Hello World!! New Sentence", Array.empty[String])
assert(bounds.length == 2)
@@ -16,7 +16,7 @@ class SentenceDetectorModelBoundsSpec extends FlatSpec {
assert(bounds(1) == Sentence("New Sentence", 14, 25))
}
- "SentenceDetectorModel" should "correct return sentence bounds with whitespaces" in {
+ "SentenceDetector" should "correct return sentence bounds with whitespaces" in {
val bounds = model.extractBounds(" Hello World!! . New Sentence ", Array.empty[String])
assert(bounds.length == 3)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentBehaviors.scala
index 5b241cb7443c06..7ca5bb9ebfa316 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentBehaviors.scala
@@ -16,7 +16,7 @@ trait PragmaticSentimentBehaviors { this: FlatSpec =>
def isolatedSentimentDetector(tokenizedSentences: Array[TokenizedSentence], expectedScore: Double): Unit = {
s"tagged sentences" should s"have an expected score of $expectedScore" in {
- val pragmaticScorer = new PragmaticScorer(SentimentDetectorModel.retrieveSentimentDict("/sentiment-corpus/default-sentiment-dict.txt", "txt", ","))
+ val pragmaticScorer = new PragmaticScorer(SentimentDetector.retrieveSentimentDict("/sentiment-corpus/default-sentiment-dict.txt", "txt", ","))
val result = pragmaticScorer.score(tokenizedSentences)
assert(result == expectedScore, s"because result: $result did not match expected: $expectedScore")
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentTestSpec.scala
index e9d3dbb59ae2d6..743cfb2494ce61 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentTestSpec.scala
@@ -2,7 +2,7 @@ package com.johnsnowlabs.nlp.annotators.sda.pragmatic
import com.johnsnowlabs.nlp.annotators.common.Sentence
import com.johnsnowlabs.nlp._
-import com.johnsnowlabs.nlp.annotators.RegexTokenizer
+import com.johnsnowlabs.nlp.annotators.Tokenizer
import org.apache.spark.storage.StorageLevel
import org.scalatest._
import org.scalatest.tagobjects.Slow
@@ -19,7 +19,7 @@ class PragmaticSentimentBigTestSpec extends FlatSpec {
val assembled = documentAssembler.transform(data)
- val sentimentDetector = new SentimentDetectorModel()
+ val sentimentDetector = new SentimentDetector()
val readyData = AnnotatorBuilder.withFullPOSTagger(AnnotatorBuilder.withFullLemmatizer(assembled))
@@ -61,7 +61,7 @@ class PragmaticSentimentTestSpec extends FlatSpec with PragmaticSentimentBehavio
"I recommend others to avoid because it is too expensive"
val sentimentSentences = {
- new RegexTokenizer().tag(Sentence.fromTexts(sentimentSentenceTexts)).toArray
+ new Tokenizer().tag(Sentence.fromTexts(sentimentSentenceTexts)).toArray
}
"an isolated sentiment detector" should behave like isolatedSentimentDetector(sentimentSentences, -4.0)
@@ -72,11 +72,11 @@ class PragmaticSentimentTestSpec extends FlatSpec with PragmaticSentimentBehavio
)
"A SentimentDetector" should "be readable and writable" in {
- val sentimentDetector = new SentimentDetectorModel()
+ val sentimentDetector = new SentimentDetector()
val path = "./test-output-tmp/sentimentdetector"
try {
sentimentDetector.write.overwrite.save(path)
- val sentimentDetectorRead = SentimentDetectorModel.read.load(path)
+ val sentimentDetectorRead = SentimentDetector.read.load(path)
assert(sentimentDetector.model.score(sentimentSentences) == sentimentDetectorRead.model.score(sentimentSentences))
} catch {
case _: java.io.IOException => succeed
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentTestSpec.scala
index 942bf77725a55c..e4f4460ac70830 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentTestSpec.scala
@@ -1,8 +1,8 @@
package com.johnsnowlabs.nlp.annotators.sda.vivekn
import com.johnsnowlabs.nlp._
-import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
-import com.johnsnowlabs.nlp.annotators.{Normalizer, RegexTokenizer}
+import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
+import com.johnsnowlabs.nlp.annotators.{Normalizer, Tokenizer}
import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.Row
@@ -41,11 +41,11 @@ class ViveknSentimentTestSpec extends FlatSpec {
.setInputCol("text")
.setOutputCol("document")
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")
- val tokenizer = new RegexTokenizer()
+ val tokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala
index ec5d6f01b04f36..3d13d6a4aaddb1 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala
@@ -1,6 +1,6 @@
package com.johnsnowlabs.nlp.annotators.spell.norvig
-import com.johnsnowlabs.nlp.annotators.{Normalizer, RegexTokenizer}
+import com.johnsnowlabs.nlp.annotators.{Normalizer, Tokenizer}
import com.johnsnowlabs.nlp._
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.{Dataset, Row}
@@ -38,7 +38,7 @@ trait NorvigSweetingBehaviors { this: FlatSpec =>
.setInputCol("text")
.setOutputCol("document")
- val tokenizer = new RegexTokenizer()
+ val tokenizer = new Tokenizer()
.setInputCols(Array("document"))
.setOutputCol("token")