From db89cbb91c6de1cd6842dc82e79bbe405ff8c937 Mon Sep 17 00:00:00 2001 From: Saif Addin Date: Thu, 25 Jan 2018 19:42:35 -0300 Subject: [PATCH 1/9] - Updated crf tests to use recursive pipeline - Added warning logs into not using recursive pipeline in ner --- python/sparknlp/annotator.py | 12 +- .../nlp/annotators/EntityExtractor.scala | 2 +- .../nlp/annotators/Normalizer.scala | 2 +- .../{RegexTokenizer.scala => Tokenizer.scala} | 24 ++- .../annotators/ner/crf/NerCrfApproach.scala | 12 +- ...r.scala => DependencyParserApproach.scala} | 4 +- ...ctorModel.scala => SentenceDetector.scala} | 4 +- .../sda/pragmatic/PragmaticScorer.scala | 2 +- ...torModel.scala => SentimentDetector.scala} | 6 +- .../nlp/util/io/ResourceHelper.scala | 4 +- .../nlp/util/regex/RuleFactory.scala | 168 +++++++++--------- .../ml/crf/CoNLL2003PipelineTest.scala | 8 +- .../johnsnowlabs/nlp/AnnotatorBuilder.scala | 14 +- .../johnsnowlabs/nlp/FinisherTestSpec.scala | 4 +- .../nlp/annotators/LemmatizerTestSpec.scala | 6 +- .../annotators/RegexTokenizerBehaviors.scala | 2 +- ...TestSpec.scala => TokenizerTestSpec.scala} | 6 +- ...> DependencyParserApproachModelTest.scala} | 2 +- ...ala => DependencyParserApproachTest.scala} | 2 +- .../PerceptronApproachTestSpec.scala | 4 +- .../pragmatic/PragmaticApproachTestSpec.scala | 10 +- ...scala => SentenceDetectorBoundsSpec.scala} | 2 +- .../PragmaticSentimentBehaviors.scala | 2 +- .../PragmaticSentimentTestSpec.scala | 10 +- .../sda/vivekn/ViveknSentimentTestSpec.scala | 8 +- .../norvig/NorvigSweetingBehaviors.scala | 4 +- 26 files changed, 173 insertions(+), 151 deletions(-) rename src/main/scala/com/johnsnowlabs/nlp/annotators/{RegexTokenizer.scala => Tokenizer.scala} (69%) rename src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/{DependencyParser.scala => DependencyParserApproach.scala} (82%) rename src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/{SentenceDetectorModel.scala => SentenceDetector.scala} (91%) rename src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/{SentimentDetectorModel.scala => SentimentDetector.scala} (91%) rename src/test/scala/com/johnsnowlabs/nlp/annotators/{RegexTokenizerTestSpec.scala => TokenizerTestSpec.scala} (87%) rename src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/{DependencyParserModelTest.scala => DependencyParserApproachModelTest.scala} (59%) rename src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/{DependencyParserTest.scala => DependencyParserApproachTest.scala} (97%) rename src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/{SentenceDetectorModelBoundsSpec.scala => SentenceDetectorBoundsSpec.scala} (93%) diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py index dfecd5b8c6ca58..63e8cf208260d1 100755 --- a/python/sparknlp/annotator.py +++ b/python/sparknlp/annotator.py @@ -83,7 +83,7 @@ def __init__(self): super(JavaTransformer, self).__init__() -class RegexTokenizer(AnnotatorTransformer): +class Tokenizer(AnnotatorTransformer): pattern = Param(Params._dummy(), "pattern", @@ -92,8 +92,8 @@ class RegexTokenizer(AnnotatorTransformer): @keyword_only def __init__(self): - super(RegexTokenizer, self).__init__() - self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.RegexTokenizer", self.uid) + super(Tokenizer, self).__init__() + self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.Tokenizer", self.uid) def setPattern(self, value): return self._set(pattern=value) @@ -307,7 +307,7 @@ class PerceptronModel(JavaModel, JavaMLWritable, 
JavaMLReadable, AnnotatorProper name = "PerceptronModel" -class SentenceDetectorModel(AnnotatorTransformer): +class SentenceDetector(AnnotatorTransformer): useAbbreviations = Param(Params._dummy(), "useAbbreviations", @@ -329,8 +329,8 @@ def setUseAbbreviations(self, value): @keyword_only def __init__(self): - super(SentenceDetectorModel, self).__init__() - self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel", self.uid) + super(SentenceDetector, self).__init__() + self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector", self.uid) class SentimentDetectorModel(AnnotatorTransformer): diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala index fe2ad96a5ff242..834156aec4756c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala @@ -79,7 +79,7 @@ class EntityExtractor(override val uid: String) extends AnnotatorModel[EntityExt private def loadEntities(): Unit = { val src = EntityExtractor.retrieveEntityExtractorPhrases($(entitiesPath), $(entitiesFormat)) - val tokenizer = new RegexTokenizer().setPattern("\\w+") + val tokenizer = new Tokenizer().setPattern("\\w+") val normalizer = new Normalizer() val phrases: Array[Array[String]] = src.map { line => diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala index ea9b743406f0f8..bdca08c85a32c8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala @@ -19,7 +19,7 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] { val pattern = new Param[String](this, "pattern", "normalization regex pattern which match will be replaced with a space") val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase") - setDefault(pattern, "[^a-zA-Z]") + setDefault(pattern, "[^\\pL+]") setDefault(lowercase, true) def getPattern: String = $(pattern) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala similarity index 69% rename from src/main/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizer.scala rename to src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala index 8a51b268cdcd12..dd2af41126988b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala @@ -12,7 +12,7 @@ import scala.util.matching.Regex * @param uid required uid for storing annotator to disk * @@ pattern: RegexPattern to split phrases into tokens */ -class RegexTokenizer(override val uid: String) extends AnnotatorModel[RegexTokenizer] { +class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] { import com.johnsnowlabs.nlp.AnnotatorType._ @@ -26,19 +26,33 @@ class RegexTokenizer(override val uid: String) extends AnnotatorModel[RegexToken def setPattern(value: String): this.type = set(pattern, value) + def addPattern(value: String) + + def setPrefixPattern(value: String) + + def setSuffixPattern(value: String) + + def addPrefixPattern(value: String) + + def addSuffixPattern(value: String) + def getPattern: String = $(pattern) + def getPrefixPattern: String + + def 
getSuffixPattern: String + setDefault(inputCols, Array(DOCUMENT)) /** A RegexTokenizer could require only for now a SentenceDetectorModel annotator */ override val requiredAnnotatorTypes: Array[AnnotatorType] = Array[AnnotatorType](DOCUMENT) - setDefault(pattern, "\\S+") + setDefault(pattern, "([^\\s\\w]?)(\\w+(?:\\.\\w{1}\\.|(?:\\-\\w+)*)?)([^\\s\\w]?)") def tag(sentences: Seq[Sentence]): Seq[TokenizedSentence] = { sentences.map{text => - val tokens = regex.findAllMatchIn(text.content).map { m => - IndexedToken(m.matched, text.begin + m.start, text.begin + m.end - 1) + val tokens = regex.findAllMatchIn(text.content).flatMap { m => + (1 to m.groupCount).map (i => IndexedToken(m.group(i), text.begin + m.start, text.begin + m.end - 1)) }.toArray TokenizedSentence(tokens) } @@ -52,4 +66,4 @@ class RegexTokenizer(override val uid: String) extends AnnotatorModel[RegexToken } } -object RegexTokenizer extends DefaultParamsReadable[RegexTokenizer] \ No newline at end of file +object Tokenizer extends DefaultParamsReadable[Tokenizer] \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala index 795ccf0b4e76f3..be39a481839b68 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala @@ -3,11 +3,11 @@ package com.johnsnowlabs.nlp.annotators.ner.crf import com.johnsnowlabs.ml.crf.{CrfParams, LinearChainCrf, TextSentenceLabels, Verbose} import com.johnsnowlabs.nlp.{AnnotatorType, DocumentAssembler, HasRecursiveFit, RecursivePipeline} import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, NAMED_ENTITY, POS, TOKEN} -import com.johnsnowlabs.nlp.annotators.RegexTokenizer +import com.johnsnowlabs.nlp.annotators.Tokenizer import com.johnsnowlabs.nlp.annotators.common.Annotated.PosTaggedSentence import com.johnsnowlabs.nlp.annotators.common.NerTagged import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach -import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp.datasets.CoNLL import com.johnsnowlabs.nlp.embeddings.ApproachWithWordEmbeddings import org.apache.spark.ml.{Pipeline, PipelineModel} @@ -88,19 +88,19 @@ class NerCrfApproach(override val uid: String) return recursivePipeline.get.transform(dataframe) } - logger.warn("NER CRF not in a RecursivePipeline." + - "It is recommended to use a com.jonsnowlabs.nlp.RecursivePipeline for" + + logger.warn("NER CRF not in a RecursivePipeline. 
" + + "It is recommended to use a com.jonsnowlabs.nlp.RecursivePipeline for " + "better performance during training") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") - val sentenceDetector = new SentenceDetectorModel() + val sentenceDetector = new SentenceDetector() .setCustomBoundChars(Array("\n\n")) .setInputCols(Array("document")) .setOutputCol("sentence") - val tokenizer = new RegexTokenizer() + val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproach.scala similarity index 82% rename from src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala rename to src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproach.scala index 29ef30f1fb03be..ca24eb703a29e1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproach.scala @@ -7,7 +7,7 @@ import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.Dataset -class DependencyParser(override val uid: String) extends AnnotatorApproach[DependencyParserModel] { +class DependencyParserApproach(override val uid: String) extends AnnotatorApproach[DependencyParserModel] { override val description: String = "Dependency Parser Estimator used to train" def this() = this(Identifiable.randomUID(DEPENDENCY)) @@ -26,4 +26,4 @@ class DependencyParser(override val uid: String) extends AnnotatorApproach[Depen } } -object DependencyParser extends DefaultParamsReadable[DependencyParser] \ No newline at end of file +object DependencyParserApproach extends DefaultParamsReadable[DependencyParserApproach] \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetector.scala similarity index 91% rename from src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModel.scala rename to src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetector.scala index 0612c7d5dfa723..e115cc9d50e5a1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetector.scala @@ -10,7 +10,7 @@ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} * @param uid internal constructor requirement for serialization of params * @@ model: Model to use for boundaries detection */ -class SentenceDetectorModel(override val uid: String) extends AnnotatorModel[SentenceDetectorModel] { +class SentenceDetector(override val uid: String) extends AnnotatorModel[SentenceDetector] { import com.johnsnowlabs.nlp.AnnotatorType._ @@ -56,4 +56,4 @@ class SentenceDetectorModel(override val uid: String) extends AnnotatorModel[Sen } } -object SentenceDetectorModel extends DefaultParamsReadable[SentenceDetectorModel] \ No newline at end of file +object SentenceDetector extends DefaultParamsReadable[SentenceDetector] \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticScorer.scala 
b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticScorer.scala index ef266d2ac2f27c..9a1e2add1dd9c1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticScorer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticScorer.scala @@ -103,6 +103,6 @@ object PragmaticScorer { new PragmaticScorer(javaSentimentDict.asScala.toMap) } def fromPath(overridePath: String, sentFormat: String, sentSeparator: String) { - new PragmaticScorer(SentimentDetectorModel.retrieveSentimentDict(overridePath, sentFormat.toUpperCase, sentSeparator)) + new PragmaticScorer(SentimentDetector.retrieveSentimentDict(overridePath, sentFormat.toUpperCase, sentSeparator)) } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetectorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala similarity index 91% rename from src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetectorModel.scala rename to src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala index 990d690fd519fb..d3da8db68b1960 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetectorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala @@ -16,7 +16,7 @@ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} * @param uid internal uid needed for saving annotator to disk * @@ model: Implementation to be applied for sentiment analysis */ -class SentimentDetectorModel(override val uid: String) extends AnnotatorModel[SentimentDetectorModel] { +class SentimentDetector(override val uid: String) extends AnnotatorModel[SentimentDetector] { import com.johnsnowlabs.nlp.AnnotatorType._ @@ -36,7 +36,7 @@ class SentimentDetectorModel(override val uid: String) extends AnnotatorModel[Se setDefault(dictSeparator, config.getString("nlp.sentimentDict.separator")) lazy val model: PragmaticScorer = - new PragmaticScorer(SentimentDetectorModel.retrieveSentimentDict($(dictPath), $(dictFormat), $(dictSeparator))) + new PragmaticScorer(SentimentDetector.retrieveSentimentDict($(dictPath), $(dictFormat), $(dictSeparator))) override val annotatorType: AnnotatorType = SENTIMENT @@ -91,7 +91,7 @@ class SentimentDetectorModel(override val uid: String) extends AnnotatorModel[Se } } -object SentimentDetectorModel extends DefaultParamsReadable[SentimentDetectorModel] { +object SentimentDetector extends DefaultParamsReadable[SentimentDetector] { /** * Sentiment dictionaries from compiled sources set in configuration diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala index 740b19881a9398..f0a72955a66fd2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala @@ -2,7 +2,7 @@ package com.johnsnowlabs.nlp.util.io import java.io.{File, FileNotFoundException, InputStream} -import com.johnsnowlabs.nlp.annotators.{Normalizer, RegexTokenizer} +import com.johnsnowlabs.nlp.annotators.{Normalizer, Tokenizer} import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher} import com.johnsnowlabs.nlp.util.io.ResourceFormat._ import org.apache.spark.ml.Pipeline @@ -367,7 +367,7 @@ object ResourceHelper { val wordCount = MMap.empty[String, Int].withDefaultValue(0) val documentAssembler = new DocumentAssembler() .setInputCol("value") - val 
tokenizer = new RegexTokenizer() + val tokenizer = new Tokenizer() .setInputCols("document") .setOutputCol("token") .setPattern(tokenPattern) diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala index 38ea929b48ebdb..f8b8f12d411493 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala @@ -28,16 +28,15 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy, import TransformStrategy._ import MatchStrategy._ - val logger = LoggerFactory.getLogger("RuleFactory") + /** Helper functions to identify context in a word for debugging */ + private val logger = LoggerFactory.getLogger("RuleFactory") + private def logSubStartHelper(start: Int): Int = if (start > 10) start - 10 else 0 + private def logSubEndHelper(sourceLength: Int, end: Int): Int = if (sourceLength - end > 10) end + 10 else sourceLength /** Rules and SymbolRules are key pieces of regex transformation */ private var rules: Seq[RegexRule] = Seq() private var symbolRules: Seq[(String, RegexRule)] = Seq() - /** Helper functions to identify context in a word for debugging */ - private def logSubStartHelper(start: Int): Int = if (start > 10) start - 10 else 0 - private def logSubEndHelper(sourceLength: Int, end: Int): Int = if (sourceLength - end > 10) end + 10 else sourceLength - /** Adds a rule to this factory*/ def addRule(rule: RegexRule): this.type = { rules = rules :+ rule @@ -50,6 +49,86 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy, this } + /** Shortcut functions, no need to execute them on runtime since a strategy won't change in lifetime of Factory */ + private val findMatchFunc = (text: String) => matchStrategy match { + case MATCH_ALL => rules.flatMap(rule => rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier))) + case MATCH_FIRST => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier))) + case MATCH_COMPLETE => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).filter(_.matched == text).map(m => RuleMatch(m, rule.identifier))) + } + + private val transformMatchFunc = (text: String, regex: Regex, transform: Regex.Match => String) => matchStrategy match { + case MATCH_ALL => regex.replaceAllIn(text, transform) + case MATCH_FIRST => regex.findFirstMatchIn(text).map(m => regex.replaceFirstIn(text, transform(m))).getOrElse(text) + case MATCH_COMPLETE => regex.findFirstMatchIn(text).filter(_.matched == text).map(m => + regex.replaceFirstIn(text, transform(m))).getOrElse(text) + case _ => throw new IllegalArgumentException("Invalid match strategy") + } + + private val transformWithSymbolFunc = (text: String, symbol: String) => transformStrategy match { + case APPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => + logger.debug("Matched: {} from: {} using rule {} with strategy {}", + () => m.matched, + () => m.source.subSequence(logSubStartHelper(m.start),logSubEndHelper(m.source.length, m.end)), + () => rule.identifier, + () => APPEND_WITH_SYMBOL) + "$0" + symbol + })) + case PREPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => + logger.debug("Matched: {} from: {} using rule {} with strategy {}", + () => m.matched, + () => m.source.subSequence(logSubStartHelper(m.start),logSubEndHelper(m.source.length, m.end)), + () => rule.identifier, + () => 
PREPEND_WITH_SYMBOL) + symbol + "$0" + })) + case REPLACE_ALL_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => + logger.debug("Matched: {} from: {} using rule {} with strategy {}", + () => m.matched, + () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), + () => rule.identifier, + () => REPLACE_ALL_WITH_SYMBOL) + symbol + })) + case REPLACE_WITH_SYMBOL_AND_BREAK => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => + logger.debug("Matched: {} from: {} using rule {} with strategy {}", + () => m.matched, + () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), + () => rule.identifier, + () => REPLACE_WITH_SYMBOL_AND_BREAK) + symbol + BREAK_INDICATOR + })) + case _ => throw new IllegalArgumentException("Invalid strategy for rule factory") + } + + private val transformWithSymbolicRulesFunc = (text: String) => transformStrategy match { + case REPLACE_EACH_WITH_SYMBOL => symbolRules.foldRight(text)((rule, target) => transformMatch(target, rule._2.regex)({ m => + logger.debug("Matched: {} from: {} using rule {} with strategy {}", + () => m.matched, + () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), + () => rule._2.identifier, + () => REPLACE_EACH_WITH_SYMBOL) + rule._1 + })) + case REPLACE_EACH_WITH_SYMBOL_AND_BREAK => symbolRules.foldRight(text)((rule, target) => rule._2.regex replaceAllIn( + target, m => { + logger.debug("Matched: {} from: {} using rule {} with strategy {}", + () => m.matched, + () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), + () => rule._2.identifier, + () => REPLACE_EACH_WITH_SYMBOL_AND_BREAK) + rule._1 + BREAK_INDICATOR + })) + case PROTECT_FROM_BREAK => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => + logger.debug("Matched: {} from: {} using rule {} with strategy {}", + () => m.matched, + () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), + () => rule.identifier, + () => PROTECT_FROM_BREAK) + PROTECTION_MARKER_OPEN + m.matched + PROTECTION_MARKER_CLOSE + })) + case _ => throw new IllegalArgumentException("Invalid strategy for rule factory") + } + /** * Adds a rule and its associated symbol to apply some transformation using such symbol * @param symbol symbol is a character to be used in a transformation application, where many rules can apply different transformations @@ -75,11 +154,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy, /**Applies factory match strategy to find matches and returns any number of Matches*/ def findMatch(text: String): Seq[RuleMatch] = { - matchStrategy match { - case MATCH_ALL => rules.flatMap(rule => rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier))) - case MATCH_FIRST => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier))) - case MATCH_COMPLETE => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).filter(_.matched == text).map(m => RuleMatch(m, rule.identifier))) - } + findMatchFunc(text) } /** Specifically finds a first match within a group of matches */ @@ -93,13 +168,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy, * @return Resulting transformation */ private def transformMatch(text: String, regex: Regex)(transform: Regex.Match => String): String = { - matchStrategy match { - case MATCH_ALL => 
regex.replaceAllIn(text, transform) - case MATCH_FIRST => regex.findFirstMatchIn(text).map(m => regex.replaceFirstIn(text, transform(m))).getOrElse(text) - case MATCH_COMPLETE => regex.findFirstMatchIn(text).filter(_.matched == text).map(m => - regex.replaceFirstIn(text, transform(m))).getOrElse(text) - case _ => throw new IllegalArgumentException("Invalid match strategy") - } + transformMatchFunc(text: String, regex: Regex, transform: Regex.Match => String) } /** @@ -109,41 +178,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy, * @return */ def transformWithSymbol(symbol: String, text: String): String = { - transformStrategy match { - case APPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => - logger.debug("Matched: {} from: {} using rule {} with strategy {}", - () => m.matched, - () => m.source.subSequence(logSubStartHelper(m.start),logSubEndHelper(m.source.length, m.end)), - () => rule.identifier, - () => APPEND_WITH_SYMBOL) - "$0" + symbol - })) - case PREPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => - logger.debug("Matched: {} from: {} using rule {} with strategy {}", - () => m.matched, - () => m.source.subSequence(logSubStartHelper(m.start),logSubEndHelper(m.source.length, m.end)), - () => rule.identifier, - () => PREPEND_WITH_SYMBOL) - symbol + "$0" - })) - case REPLACE_ALL_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => - logger.debug("Matched: {} from: {} using rule {} with strategy {}", - () => m.matched, - () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), - () => rule.identifier, - () => REPLACE_ALL_WITH_SYMBOL) - symbol - })) - case REPLACE_WITH_SYMBOL_AND_BREAK => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => - logger.debug("Matched: {} from: {} using rule {} with strategy {}", - () => m.matched, - () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), - () => rule.identifier, - () => REPLACE_WITH_SYMBOL_AND_BREAK) - symbol + BREAK_INDICATOR - })) - case _ => throw new IllegalArgumentException("Invalid strategy for rule factory") - } + transformWithSymbolFunc(symbol, text) } /** @@ -152,34 +187,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy, * @return Returns a transformed text */ def transformWithSymbolicRules(text: String): String = { - transformStrategy match { - case REPLACE_EACH_WITH_SYMBOL => symbolRules.foldRight(text)((rule, target) => transformMatch(target, rule._2.regex)({ m => - logger.debug("Matched: {} from: {} using rule {} with strategy {}", - () => m.matched, - () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), - () => rule._2.identifier, - () => REPLACE_EACH_WITH_SYMBOL) - rule._1 - })) - case REPLACE_EACH_WITH_SYMBOL_AND_BREAK => symbolRules.foldRight(text)((rule, target) => rule._2.regex replaceAllIn( - target, m => { - logger.debug("Matched: {} from: {} using rule {} with strategy {}", - () => m.matched, - () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), - () => rule._2.identifier, - () => REPLACE_EACH_WITH_SYMBOL_AND_BREAK) - rule._1 + BREAK_INDICATOR - })) - case PROTECT_FROM_BREAK => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m => - logger.debug("Matched: {} from: {} using rule {} with strategy {}", - () => m.matched, 
- () => m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)), - () => rule.identifier, - () => PROTECT_FROM_BREAK) - PROTECTION_MARKER_OPEN + m.matched + PROTECTION_MARKER_CLOSE - })) - case _ => throw new IllegalArgumentException("Invalid strategy for rule factory") - } + transformWithSymbolicRulesFunc(text) } } object RuleFactory { diff --git a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala index 5d2c5a2fe0a561..d103c1f89b122f 100644 --- a/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala @@ -1,12 +1,12 @@ package com.johnsnowlabs.ml.crf import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.annotators.RegexTokenizer +import com.johnsnowlabs.nlp.annotators.Tokenizer import com.johnsnowlabs.nlp.annotators.common.Annotated.{NerTaggedSentence, PosTaggedSentence} import com.johnsnowlabs.nlp.annotators.common.{NerTagged, PosTagged, TaggedSentence} import com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach -import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp.datasets.CoNLL import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat import org.apache.spark.ml.{PipelineModel, PipelineStage} @@ -30,12 +30,12 @@ object CoNLL2003PipelineTest extends App { .setInputCol("text") .setOutputCol("document") - val sentenceDetector = new SentenceDetectorModel() + val sentenceDetector = new SentenceDetector() .setCustomBoundChars(Array("\n\n")) .setInputCols(Array("document")) .setOutputCol("sentence") - val tokenizer = new RegexTokenizer() + val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala index 0e67f45913aa83..6a7ce1197a81bf 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala @@ -2,10 +2,10 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotators._ import com.johnsnowlabs.nlp.annotators.ner.crf.{NerCrfApproach, NerCrfModel} -import com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParser +import com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserApproach import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach -import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel -import com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector +import com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat @@ -26,7 +26,7 @@ object AnnotatorBuilder extends FlatSpec { this: Suite => } def withTokenizer(dataset: Dataset[Row]): Dataset[Row] = { - val regexTokenizer = new RegexTokenizer() + val regexTokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") regexTokenizer.transform(withFullPragmaticSentenceDetector(dataset)) @@ -75,7 +75,7 @@ object AnnotatorBuilder extends FlatSpec { 
this: Suite => } def withFullPragmaticSentenceDetector(dataset: Dataset[Row]): Dataset[Row] = { - val sentenceDetector = new SentenceDetectorModel() + val sentenceDetector = new SentenceDetector() .setInputCols(Array("document")) .setOutputCol("sentence") sentenceDetector.transform(dataset) @@ -109,7 +109,7 @@ object AnnotatorBuilder extends FlatSpec { this: Suite => } def withPragmaticSentimentDetector(dataset: Dataset[Row]): Dataset[Row] = { - val sentimentDetector = new SentimentDetectorModel + val sentimentDetector = new SentimentDetector sentimentDetector .setInputCols(Array("token", "sentence")) .setOutputCol("sentiment") @@ -139,7 +139,7 @@ object AnnotatorBuilder extends FlatSpec { this: Suite => def withDependencyParser(dataset: Dataset[Row]): Dataset[Row] = { val df = withFullPOSTagger(withTokenizer(dataset)) - new DependencyParser() + new DependencyParserApproach() .setInputCols(Array("sentence", "pos", "token")) .setOutputCol("dependency") .setSourcePath("src/test/resources/models/dep-model.txt") diff --git a/src/test/scala/com/johnsnowlabs/nlp/FinisherTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/FinisherTestSpec.scala index 0e09cd6313f09c..99c8f306a69f45 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/FinisherTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/FinisherTestSpec.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.nlp -import com.johnsnowlabs.nlp.annotators.RegexTokenizer +import com.johnsnowlabs.nlp.annotators.Tokenizer import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.StopWordsRemover import org.scalatest._ @@ -14,7 +14,7 @@ class FinisherTestSpec extends FlatSpec { .setInputCol("text") .setOutputCol("document") - val tokenizer = new RegexTokenizer() + val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/LemmatizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/LemmatizerTestSpec.scala index 0ed89436902aa4..92ab5cf6aca237 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/LemmatizerTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/LemmatizerTestSpec.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.nlp.annotators -import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.types.ArrayType @@ -43,11 +43,11 @@ class LemmatizerTestSpec extends FlatSpec with LemmatizerBehaviors { .setInputCol("text") .setOutputCol("document") - val sentenceDetector = new SentenceDetectorModel() + val sentenceDetector = new SentenceDetector() .setInputCols(Array("document")) .setOutputCol("sentence") - val tokenizer = new RegexTokenizer() + val tokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala index 2aa10425d7d7d2..4915ffa57e018e 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.nlp.annotators -import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import 
com.johnsnowlabs.nlp.{Annotation, AnnotatorBuilder, AnnotatorType} import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala similarity index 87% rename from src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerTestSpec.scala rename to src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala index 39fc39e98cd48f..f9947c29654b87 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala @@ -8,9 +8,9 @@ import java.util.Date /** * Created by saif on 02/05/17. */ -class RegexTokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors { +class TokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors { - val regexTokenizer = new RegexTokenizer + val regexTokenizer = new Tokenizer "a RegexTokenizer" should s"be of type ${AnnotatorType.TOKEN}" in { assert(regexTokenizer.annotatorType == AnnotatorType.TOKEN) @@ -25,7 +25,7 @@ class RegexTokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors { val assembled = documentAssembler.transform(data) - val tokenizer = new RegexTokenizer() + val tokenizer = new Tokenizer() .setOutputCol("token") val tokenized = tokenizer.transform(assembled) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachModelTest.scala similarity index 59% rename from src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModelTest.scala rename to src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachModelTest.scala index e73d3b1cfaf313..879f09b16c9d36 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModelTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachModelTest.scala @@ -2,6 +2,6 @@ package com.johnsnowlabs.nlp.annotators.parser.dep import org.scalatest.FlatSpec -class DependencyParserModelTest extends FlatSpec { +class DependencyParserApproachModelTest extends FlatSpec { } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala similarity index 97% rename from src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserTest.scala rename to src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala index 2918a42e37254c..c76e272bf94e5e 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala @@ -5,7 +5,7 @@ import org.apache.spark.sql.Row import org.scalatest.FlatSpec import scala.language.reflectiveCalls -class DependencyParserTest extends FlatSpec { +class DependencyParserApproachTest extends FlatSpec { def fixture = new { val df = AnnotatorBuilder.withDependencyParser(DataBuilder.basicDataBuild(ContentProvider.depSentence)) val dependencies = df.select("dependency") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala index 
230d883894a6fc..51ea37f3235051 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.nlp.annotators.pos.perceptron -import com.johnsnowlabs.nlp.annotators.RegexTokenizer +import com.johnsnowlabs.nlp.annotators.Tokenizer import com.johnsnowlabs.nlp.annotators.common.Sentence import com.johnsnowlabs.nlp.{ContentProvider, DataBuilder} import org.scalatest._ @@ -31,7 +31,7 @@ class PerceptronApproachTestSpec extends FlatSpec with PerceptronApproachBehavio sentence } - new RegexTokenizer().tag(sentences).toArray + new Tokenizer().tag(sentences).toArray } "an isolated perceptron tagger" should behave like isolatedPerceptronTagging( diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticApproachTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticApproachTestSpec.scala index b119c6cd033982..977fd8076db05c 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticApproachTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticApproachTestSpec.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic -import com.johnsnowlabs.nlp.annotators.RegexTokenizer +import com.johnsnowlabs.nlp.annotators.Tokenizer import com.johnsnowlabs.nlp._ import org.apache.spark.storage.StorageLevel import org.scalatest._ @@ -29,14 +29,14 @@ class PragmaticApproachBigTestSpec extends FlatSpec { val documentAssembler = new DocumentAssembler() .setInputCol("text") - val sentenceDetector = new SentenceDetectorModel() + val sentenceDetector = new SentenceDetector() .setOutputCol("my_sbd_sentences") val assembled = documentAssembler.transform(mergedSentences) val sentenced = sentenceDetector.transform(assembled) - val tokenizedFromDisk = new RegexTokenizer() + val tokenizedFromDisk = new Tokenizer() .setInputCols(Array("my_sbd_sentences")) .setOutputCol("token") @@ -97,11 +97,11 @@ class PragmaticApproachTestSpec extends FlatSpec with PragmaticDetectionBehavior ) "A Pragmatic SBD" should "be readable and writable" taggedAs Tag("LinuxOnly") in { - val pragmaticDetector = new SentenceDetectorModel() + val pragmaticDetector = new SentenceDetector() val path = "./test-output-tmp/pragmaticdetector" try { pragmaticDetector.write.overwrite.save(path) - val pragmaticDetectorRead = SentenceDetectorModel.read.load(path) + val pragmaticDetectorRead = SentenceDetector.read.load(path) } catch { case _: java.io.IOException => succeed } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModelBoundsSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala similarity index 93% rename from src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModelBoundsSpec.scala rename to src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala index 9683fc7518e7df..8aca483f910e7a 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorModelBoundsSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala @@ -4,7 +4,7 @@ import com.johnsnowlabs.nlp.annotators.common.Sentence import org.scalatest.FlatSpec -class SentenceDetectorModelBoundsSpec extends FlatSpec { +class SentenceDetectorBoundsSpec 
extends FlatSpec { val model = new PragmaticMethod(false) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentBehaviors.scala index 5b241cb7443c06..7ca5bb9ebfa316 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentBehaviors.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentBehaviors.scala @@ -16,7 +16,7 @@ trait PragmaticSentimentBehaviors { this: FlatSpec => def isolatedSentimentDetector(tokenizedSentences: Array[TokenizedSentence], expectedScore: Double): Unit = { s"tagged sentences" should s"have an expected score of $expectedScore" in { - val pragmaticScorer = new PragmaticScorer(SentimentDetectorModel.retrieveSentimentDict("/sentiment-corpus/default-sentiment-dict.txt", "txt", ",")) + val pragmaticScorer = new PragmaticScorer(SentimentDetector.retrieveSentimentDict("/sentiment-corpus/default-sentiment-dict.txt", "txt", ",")) val result = pragmaticScorer.score(tokenizedSentences) assert(result == expectedScore, s"because result: $result did not match expected: $expectedScore") } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentTestSpec.scala index e9d3dbb59ae2d6..743cfb2494ce61 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/PragmaticSentimentTestSpec.scala @@ -2,7 +2,7 @@ package com.johnsnowlabs.nlp.annotators.sda.pragmatic import com.johnsnowlabs.nlp.annotators.common.Sentence import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.annotators.RegexTokenizer +import com.johnsnowlabs.nlp.annotators.Tokenizer import org.apache.spark.storage.StorageLevel import org.scalatest._ import org.scalatest.tagobjects.Slow @@ -19,7 +19,7 @@ class PragmaticSentimentBigTestSpec extends FlatSpec { val assembled = documentAssembler.transform(data) - val sentimentDetector = new SentimentDetectorModel() + val sentimentDetector = new SentimentDetector() val readyData = AnnotatorBuilder.withFullPOSTagger(AnnotatorBuilder.withFullLemmatizer(assembled)) @@ -61,7 +61,7 @@ class PragmaticSentimentTestSpec extends FlatSpec with PragmaticSentimentBehavio "I recommend others to avoid because it is too expensive" val sentimentSentences = { - new RegexTokenizer().tag(Sentence.fromTexts(sentimentSentenceTexts)).toArray + new Tokenizer().tag(Sentence.fromTexts(sentimentSentenceTexts)).toArray } "an isolated sentiment detector" should behave like isolatedSentimentDetector(sentimentSentences, -4.0) @@ -72,11 +72,11 @@ class PragmaticSentimentTestSpec extends FlatSpec with PragmaticSentimentBehavio ) "A SentimentDetector" should "be readable and writable" in { - val sentimentDetector = new SentimentDetectorModel() + val sentimentDetector = new SentimentDetector() val path = "./test-output-tmp/sentimentdetector" try { sentimentDetector.write.overwrite.save(path) - val sentimentDetectorRead = SentimentDetectorModel.read.load(path) + val sentimentDetectorRead = SentimentDetector.read.load(path) assert(sentimentDetector.model.score(sentimentSentences) == sentimentDetectorRead.model.score(sentimentSentences)) } catch { case _: java.io.IOException => succeed diff --git 
a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentTestSpec.scala index 942bf77725a55c..e4f4460ac70830 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentTestSpec.scala @@ -1,8 +1,8 @@ package com.johnsnowlabs.nlp.annotators.sda.vivekn import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel -import com.johnsnowlabs.nlp.annotators.{Normalizer, RegexTokenizer} +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector +import com.johnsnowlabs.nlp.annotators.{Normalizer, Tokenizer} import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.Row @@ -41,11 +41,11 @@ class ViveknSentimentTestSpec extends FlatSpec { .setInputCol("text") .setOutputCol("document") - val sentenceDetector = new SentenceDetectorModel() + val sentenceDetector = new SentenceDetector() .setInputCols(Array("document")) .setOutputCol("sentence") - val tokenizer = new RegexTokenizer() + val tokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala index ec5d6f01b04f36..3d13d6a4aaddb1 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.nlp.annotators.spell.norvig -import com.johnsnowlabs.nlp.annotators.{Normalizer, RegexTokenizer} +import com.johnsnowlabs.nlp.annotators.{Normalizer, Tokenizer} import com.johnsnowlabs.nlp._ import org.apache.spark.ml.Pipeline import org.apache.spark.sql.{Dataset, Row} @@ -38,7 +38,7 @@ trait NorvigSweetingBehaviors { this: FlatSpec => .setInputCol("text") .setOutputCol("document") - val tokenizer = new RegexTokenizer() + val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") From 4665a161d35d99c12a3ec795a6f95928a766d2c0 Mon Sep 17 00:00:00 2001 From: Saif Addin Date: Fri, 26 Jan 2018 16:03:22 -0300 Subject: [PATCH 2/9] - Tokenizer wip --- .../com/johnsnowlabs/nlp/AnnotatorModel.scala | 4 ++ .../nlp/annotators/EntityExtractor.scala | 2 +- .../nlp/annotators/Tokenizer.scala | 68 +++++++++++++------ .../sda/pragmatic/SentimentDetector.scala | 24 ++----- .../nlp/util/io/ResourceHelper.scala | 2 +- .../nlp/util/regex/RegexRule.scala | 2 +- .../nlp/util/regex/RuleFactory.scala | 7 +- .../nlp/annotators/TokenizerTestSpec.scala | 21 +++++- 8 files changed, 87 insertions(+), 43 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala index d3ed36a2daff82..4fca2ce01d9a52 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala @@ -53,6 +53,9 @@ abstract class AnnotatorModel[M <: Model[M]] StructType(outputFields) } + /** override this function if you need to reset or clear annotate variables just once before annotating */ + def beforeAnnotate(): Unit = {} + /** * Given requirements are met, this applies ML 
transformation within a Pipeline or stand-alone * Output annotation will be generated as a new column, previous annotations are still available separately @@ -65,6 +68,7 @@ abstract class AnnotatorModel[M <: Model[M]] s"${requiredAnnotatorTypes.mkString(", ")}") val metadataBuilder: MetadataBuilder = new MetadataBuilder() metadataBuilder.putString("annotatorType", annotatorType) + beforeAnnotate() dataset.withColumn( getOutputCol, dfAnnotate( diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala index 834156aec4756c..0273cf9a6ab24a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/EntityExtractor.scala @@ -79,7 +79,7 @@ class EntityExtractor(override val uid: String) extends AnnotatorModel[EntityExt private def loadEntities(): Unit = { val src = EntityExtractor.retrieveEntityExtractorPhrases($(entitiesPath), $(entitiesFormat)) - val tokenizer = new Tokenizer().setPattern("\\w+") + val tokenizer = new Tokenizer() val normalizer = new Normalizer() val phrases: Array[Array[String]] = src.map { line => diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala index dd2af41126988b..610dd43d48ecb4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala @@ -1,7 +1,8 @@ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.annotators.common._ -import org.apache.spark.ml.param.Param +import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory} +import org.apache.spark.ml.param.{Param, StringArrayParam} import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType} import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} @@ -16,44 +17,71 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] { import com.johnsnowlabs.nlp.AnnotatorType._ - val pattern: Param[String] = new Param(this, "pattern", "this is the token pattern") - - lazy val regex: Regex = $(pattern).r + val wordPattern: Param[String] = new Param(this, "wordPattern", "this is the base word pattern. Defaults \\w+") + val extensionPattern: StringArrayParam = new StringArrayParam(this, "infixPattern", "infix patterns allow for word exceptions that count as single token. E.g. U.S.A. 
Defaults ") + val prefixPattern: StringArrayParam = new StringArrayParam(this, "prefixPattern", "this is the token pattern") + val suffixPattern: StringArrayParam = new StringArrayParam(this, "suffixPattern", "this is the token pattern") override val annotatorType: AnnotatorType = TOKEN + /** A RegexTokenizer could require only for now a SentenceDetectorModel annotator */ + override val requiredAnnotatorTypes: Array[AnnotatorType] = Array[AnnotatorType](DOCUMENT) + def this() = this(Identifiable.randomUID("REGEX_TOKENIZER")) - def setPattern(value: String): this.type = set(pattern, value) + def setWordPattern(value: String): this.type = set(wordPattern, value) - def addPattern(value: String) + def setExtensionPattern(value: Array[String]): this.type = set(extensionPattern, value) - def setPrefixPattern(value: String) + def addExtensionPattern(value: String): this.type = set(extensionPattern, $(extensionPattern) :+ value) - def setSuffixPattern(value: String) + def setPrefixPattern(value: Array[String]): this.type = set(prefixPattern, value) - def addPrefixPattern(value: String) + def addPrefixPattern(value: String): this.type = set(prefixPattern, $(prefixPattern) :+ value) - def addSuffixPattern(value: String) + def setSuffixPattern(value: Array[String]): this.type = set(suffixPattern, value) - def getPattern: String = $(pattern) + def addSuffixPattern(value: String): this.type = set(suffixPattern, $(suffixPattern) :+ value) - def getPrefixPattern: String + def getWordPattern: String = $(wordPattern) - def getSuffixPattern: String + def getInfixPattern: Array[String] = $(extensionPattern) - setDefault(inputCols, Array(DOCUMENT)) + def getPrefixPattern: Array[String] = $(prefixPattern) - /** A RegexTokenizer could require only for now a SentenceDetectorModel annotator */ - override val requiredAnnotatorTypes: Array[AnnotatorType] = Array[AnnotatorType](DOCUMENT) + def getSuffixPattern: Array[String] = $(suffixPattern) - setDefault(pattern, "([^\\s\\w]?)(\\w+(?:\\.\\w{1}\\.|(?:\\-\\w+)*)?)([^\\s\\w]?)") + setDefault(inputCols, Array(DOCUMENT)) + + setDefault(wordPattern, "\\w+") + setDefault(extensionPattern, Array("\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*")) + setDefault(prefixPattern, Array("([^\\s\\w]?)")) + setDefault(suffixPattern, Array("([^\\s\\w]?)")) + + val ruleFactory = new RuleFactory(MatchStrategy.MATCH_ALL) + + override def beforeAnnotate(): Unit = { + /** Clears out rules and constructs a new rule for every combination of rules provided */ + /** The strategy is to catch one token per regex group */ + /** User may add its own groups if needs targets to be tokenized separately from the rest */ + /** "([^\\s\\w]?)(\\w+(?:\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*)?)([^\\s\\w]?)" */ + ruleFactory + .clearRules() + $(prefixPattern).foreach(pp => $(suffixPattern).foreach (sp => $(extensionPattern).foreach(ep => { + ruleFactory.addRule( + (pp + "(" + $(wordPattern) + "(?:" + ep + ")?" 
+ ")" + sp).r, + "tokenizer construction pattern" + ) + }))) + } def tag(sentences: Seq[Sentence]): Seq[TokenizedSentence] = { sentences.map{text => - val tokens = regex.findAllMatchIn(text.content).flatMap { m => - (1 to m.groupCount).map (i => IndexedToken(m.group(i), text.begin + m.start, text.begin + m.end - 1)) - }.toArray + val tokens = ruleFactory.findMatch(text.content).flatMap { m => + (1 to m.content.groupCount) + .map (i => IndexedToken(m.content.group(i), text.begin + m.content.start, text.begin + m.content.end - 1)) + }.filter(t => t.token.nonEmpty).toArray + tokens.foreach(t => println(t.token)) TokenizedSentence(tokens) } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala index d3da8db68b1960..0fa6362eb45650 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sda/pragmatic/SentimentDetector.scala @@ -44,29 +44,17 @@ class SentimentDetector(override val uid: String) extends AnnotatorModel[Sentime def this() = this(Identifiable.randomUID("SENTIMENT")) - def setDictPath(path: String): this.type = { - set(dictPath, path) - } + def setDictPath(path: String): this.type = set(dictPath, path) - def getDictPath: String = { - $(dictPath) - } + def getDictPath: String = $(dictPath) - def setDictFormat(format: String): this.type = { - set(dictFormat, format) - } + def setDictFormat(format: String): this.type = set(dictFormat, format) - def getDictFormat: String = { - $(dictFormat) - } + def getDictFormat: String = $(dictFormat) - def setDictSeparator(separator: String): this.type = { - set(dictSeparator, separator) - } + def setDictSeparator(separator: String): this.type = set(dictSeparator, separator) - def getDictSeparator: String = { - $(dictSeparator) - } + def getDictSeparator: String = $(dictSeparator) /** * Tokens are needed to identify each word in a sentence boundary diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala index f0a72955a66fd2..a3d4837011d899 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala @@ -370,7 +370,7 @@ object ResourceHelper { val tokenizer = new Tokenizer() .setInputCols("document") .setOutputCol("token") - .setPattern(tokenPattern) + .setWordPattern(tokenPattern) val normalizer = new Normalizer() .setInputCols("token") .setOutputCol("normal") diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RegexRule.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RegexRule.scala index df81da027773fd..cb3428feafbb55 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RegexRule.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RegexRule.scala @@ -7,7 +7,7 @@ import scala.util.matching.Regex * @param rx a java.matching.Regex object * @param identifier some description that might help link the regex to its meaning */ -class RegexRule(rx: Regex, val identifier: String) { +class RegexRule(rx: Regex, val identifier: String) extends Serializable { def this(rx: String, identifier: String) { this(rx.r, identifier) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala index f8b8f12d411493..f7a544d9b1d650 100644 --- 
a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala @@ -16,7 +16,7 @@ import scala.util.matching.Regex */ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy, transformStrategy: TransformStrategy.TransformStrategy = TransformStrategy.NO_TRANSFORM) - extends RuleSymbols { + extends RuleSymbols with Serializable { /** * Internal representation of a regex match @@ -49,6 +49,11 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy, this } + def clearRules(): this.type = { + rules = Seq.empty[RegexRule] + this + } + /** Shortcut functions, no need to execute them on runtime since a strategy won't change in lifetime of Factory */ private val findMatchFunc = (text: String) => matchStrategy match { case MATCH_ALL => rules.flatMap(rule => rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier))) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala index f9947c29654b87..10b7786142e756 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala @@ -5,6 +5,9 @@ import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ import java.util.Date +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector +import org.apache.spark.ml.Pipeline + /** * Created by saif on 02/05/17. */ @@ -12,10 +15,26 @@ class TokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors { val regexTokenizer = new Tokenizer - "a RegexTokenizer" should s"be of type ${AnnotatorType.TOKEN}" in { + "a Tokenizer" should s"be of type ${AnnotatorType.TOKEN}" in { assert(regexTokenizer.annotatorType == AnnotatorType.TOKEN) } + "a Tokenizer" should "correctly tokenize target text on its defaults parameters" in { + val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 
'He said', I'll defeat markus-crassus.") + import data.sparkSession.implicits._ + val tokenizer = new Tokenizer().setInputCols("text").setOutputCol("token") + val sentence = new SentenceDetector().setInputCols("token").setOutputCol("sentence") + val finisher = new Finisher().setInputCols("sentence")//.setOutputAsArray(true) + val pipeline = new Pipeline().setStages(Array(tokenizer, sentence, finisher)) + pipeline.fit(data).transform(data).select("finished_sentence").show + assert(pipeline.fit(data).transform(data).select("finished_sentence").as[Array[String]] + .collect + .sameElements(Array( + "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".", + "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", ".") + )) + } + "a spark based tokenizer" should "resolve big data" in { val data = ContentProvider.parquetData.limit(500000) .repartition(16) From 4f9ff021b173aec68ac69f15f2872cb52dfaf897 Mon Sep 17 00:00:00 2001 From: Saif Addin Date: Fri, 26 Jan 2018 16:48:11 -0300 Subject: [PATCH 3/9] - Tokenizer testing use cases --- docs/components.html | 22 +++---- docs/notebooks.html | 4 +- docs/quickstart.html | 10 +-- python/example/crf-ner/ner.ipynb | 4 +- python/example/crf-ner/ner_benchmark.ipynb | 4 +- .../dictionary-sentiment/sentiment.ipynb | 6 +- .../entities-extractor/extractor.ipynb | 4 +- .../example/vivekn-sentiment/sentiment.ipynb | 65 +++++++++++++++---- python/sparknlp/annotator.py | 6 +- python/test/annotators.py | 24 +++---- .../nlp/annotators/Tokenizer.scala | 8 +-- ...haviors.scala => TokenizerBehaviors.scala} | 12 ++-- .../nlp/annotators/TokenizerTestSpec.scala | 51 +++++++++++---- .../SentenceDetectorBoundsSpec.scala | 4 +- 14 files changed, 146 insertions(+), 78 deletions(-) rename src/test/scala/com/johnsnowlabs/nlp/annotators/{RegexTokenizerBehaviors.scala => TokenizerBehaviors.scala} (80%) diff --git a/docs/components.html b/docs/components.html index d191942d012412..57314aba919844 100644 --- a/docs/components.html +++ b/docs/components.html @@ -172,7 +172,7 @@

1. DocumentAssembler: Getting data in
- 2. RegexTokenizer: Word tokens
+ 2. Tokenizer: Word tokens

Example:

- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
   .setInputCols(["document"]) \
   .setOutputCol("sentence") \
   .setUseAbbreviations(True)
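These renames are mechanical, so existing pipelines only need their constructor calls updated. For reference, a minimal Scala sketch of the renamed annotators wired together (column names and pipeline shape are illustrative, mirroring the test specs in this PR):

    import com.johnsnowlabs.nlp.DocumentAssembler
    import com.johnsnowlabs.nlp.annotators.Tokenizer
    import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
    import org.apache.spark.ml.Pipeline

    val documentAssembler = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val sentenceDetector = new SentenceDetector()   // was SentenceDetectorModel
      .setInputCols(Array("document"))
      .setOutputCol("sentence")

    val tokenizer = new Tokenizer()                 // was RegexTokenizer
      .setInputCols(Array("sentence"))
      .setOutputCol("token")

    val pipeline = new Pipeline()
      .setStages(Array(documentAssembler, sentenceDetector, tokenizer))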
@@ -673,7 +673,7 @@

9. SentenceDetector: Sentence Boundary detection
Example:

- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
   .setInputCols("document")
   .setOutputCol("sentence")
@@ -790,7 +790,7 @@

11. SentimentDetector: Sentiment analysis
Example:

- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
   .setInputCols(["lemma", "sentence"]) \
   .setOutputCol("sentiment")
@@ -825,7 +825,7 @@

11. SentimentDetector: Sentiment analysis
Example:

- val sentimentDetector = new SentimentDetectorModel
+ val sentimentDetector = new SentimentDetector
   .setInputCols(Array("token", "sentence"))
   .setOutputCol("sentiment")
@@ -902,7 +902,7 @@

13. SpellChecker: Token spell corrector
Inputs: Any text for corpus. A list of words for dictionary. A comma separated custom dictionary.
- Requires: RegexTokenizer
+ Requires: Tokenizer
Functions:
•
@@ -947,7 +947,7 @@

13. SpellChecker: Token spell corrector
Inputs: Any text for corpus. A list of words for dictionary. A comma separated custom dictionary.
    - Requires: RegexTokenizer
    + Requires: Tokenizer
    Functions:
•
@@ -1017,7 +1017,7 @@

14. ViveknSentimentDetector: Sentiment analysis
Input: File or folder of text files of positive and negative data
Example:

- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
           .setInputCols(["lemma", "sentence"]) \
           .setOutputCol("sentiment")
      @@ -1225,7 +1225,7 @@

16. TokenAssembler: Getting data reshaped
Annotators
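Behind those Tokenizer changes, Patch 2 also makes RuleFactory serializable and adds clearRules(), so the rule set can be rebuilt whenever the annotator's parameters change. A small self-contained sketch of that add/clear/match cycle (the regex and identifier are illustrative):

    import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}

    object RuleFactoryDemo extends App {
      val factory = new RuleFactory(MatchStrategy.MATCH_ALL)
      factory.clearRules()                   // drop rules from any previous configuration
      factory.addRule("\\w+".r, "word rule") // the identifier shows up in debug logging
      // findMatch returns RuleMatch objects whose `content` is the underlying Regex.Match
      factory.findMatch("hello world").foreach(m => println(m.content.matched))
    }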