From db89cbb91c6de1cd6842dc82e79bbe405ff8c937 Mon Sep 17 00:00:00 2001
From: Saif Addin
1. DocumentAssembler: Getting
- 2. RegexTokenizer: Word tokens
+ 2. Tokenizer: Word tokens
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setInputCols(["sentences"]) \
.setOutputCol("token")
@@ -218,7 +218,7 @@ 2. RegexTokenizer: Word tokens
Example:
- val regexTokenizer = new RegexTokenizer()
+ val regexTokenizer = new Tokenizer()
.setInputCols("sentence")
.setOutputCol("token")
@@ -653,7 +653,7 @@ 9. SentenceDetector: Sentence B
Example:
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence") \
.setUseAbbreviations(True)
@@ -673,7 +673,7 @@ 9. SentenceDetector: Sentence B
Example:
- val sentenceDetector = new SentenceDetectorModel()
+ val sentenceDetector = new SentenceDetector()
.setInputCols("document")
.setOutputCol("sentence")
@@ -790,7 +790,7 @@ 11. SentimentDetector: Sentime
Example:
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment")
@@ -825,7 +825,7 @@ 11. SentimentDetector: Sentime
Example:
- val sentimentDetector = new SentimentDetectorModel
+ val sentimentDetector = new SentimentDetector
.setInputCols(Array("token", "sentence"))
.setOutputCol("sentiment")
@@ -902,7 +902,7 @@ 13. SpellChecker: Token spell
Inputs: Any text for corpus. A list of words for dictionary. A
comma
separated custom dictionary.
- Requires: RegexTokenizer
+ Requires: Tokenizer
Functions:
-
@@ -947,7 +947,7 @@ 13. SpellChecker: Token spell
Inputs: Any text for corpus. A list of words for dictionary. A
comma
separated custom dictionary.
- Requires: RegexTokenizer
+ Requires: Tokenizer
Functions:
-
@@ -1017,7 +1017,7 @@ 14. ViveknSentimentDetec
Input: File or folder of text files of positive and negative data
Example:
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment")
@@ -1225,7 +1225,7 @@ 16. TokenAssembler: Getting data
Annotators
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
-val sentenceDetector = new SentenceDetectorModel()
+ import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
+val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")
-val regexTokenizer = new RegexTokenizer()
+val regexTokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
diff --git a/python/example/crf-ner/ner.ipynb b/python/example/crf-ner/ner.ipynb
index a29bbbbb5b301c..7bf620236fa9ff 100644
--- a/python/example/crf-ner/ner.ipynb
+++ b/python/example/crf-ner/ner.ipynb
@@ -101,11 +101,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- "sentenceDetector = SentenceDetectorModel()\\\n",
+ "sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer()\\\n",
+ "tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/crf-ner/ner_benchmark.ipynb b/python/example/crf-ner/ner_benchmark.ipynb
index b63c636ebe4833..4877ef0db82e5f 100644
--- a/python/example/crf-ner/ner_benchmark.ipynb
+++ b/python/example/crf-ner/ner_benchmark.ipynb
@@ -182,11 +182,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- " sentenceDetector = SentenceDetectorModel()\\\n",
+ " sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- " tokenizer = RegexTokenizer()\\\n",
+ " tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/dictionary-sentiment/sentiment.ipynb b/python/example/dictionary-sentiment/sentiment.ipynb
index 81264b96d533b6..0726efe0ae50f3 100644
--- a/python/example/dictionary-sentiment/sentiment.ipynb
+++ b/python/example/dictionary-sentiment/sentiment.ipynb
@@ -45,11 +45,11 @@
"document_assembler = DocumentAssembler() \\\n",
" .setInputCol(\"text\")\n",
"\n",
- "sentence_detector = SentenceDetectorModel() \\\n",
+ "sentence_detector = SentenceDetector() \\\n",
" .setInputCols([\"document\"]) \\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer() \\\n",
+ "tokenizer = Tokenizer() \\\n",
" .setInputCols([\"sentence\"]) \\\n",
" .setOutputCol(\"token\")\n",
"\n",
@@ -58,7 +58,7 @@
" .setOutputCol(\"lemma\") \\\n",
" .setDictionary(\"../../../src/test/resources/lemma-corpus/AntBNC_lemmas_ver_001.txt\")\n",
" \n",
- "sentiment_detector = SentimentDetectorModel() \\\n",
+ "sentiment_detector = SentimentDetector() \\\n",
" .setInputCols([\"lemma\", \"sentence\"]) \\\n",
" .setOutputCol(\"sentiment_score\") \\\n",
" .setDictPath(\"../../../src/test/resources/sentiment-corpus/default-sentiment-dict.txt\")\n",
diff --git a/python/example/entities-extractor/extractor.ipynb b/python/example/entities-extractor/extractor.ipynb
index 625d4523df7f41..a0d692f3bd2c1f 100644
--- a/python/example/entities-extractor/extractor.ipynb
+++ b/python/example/entities-extractor/extractor.ipynb
@@ -51,11 +51,11 @@
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
- "sentenceDetector = SentenceDetectorModel()\\\n",
+ "sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
- "tokenizer = RegexTokenizer()\\\n",
+ "tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
diff --git a/python/example/vivekn-sentiment/sentiment.ipynb b/python/example/vivekn-sentiment/sentiment.ipynb
index f453217ad1e14c..bf11f116846402 100644
--- a/python/example/vivekn-sentiment/sentiment.ipynb
+++ b/python/example/vivekn-sentiment/sentiment.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"collapsed": true
},
@@ -20,9 +20,42 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------+---------+--------------------+\n",
+ "|itemid|sentiment| text|\n",
+ "+------+---------+--------------------+\n",
+ "| 1| 0| ...|\n",
+ "| 2| 0| ...|\n",
+ "| 3| 1| omg...|\n",
+ "| 4| 0| .. Omga...|\n",
+ "| 5| 0| i think ...|\n",
+ "| 6| 0| or i jus...|\n",
+ "| 7| 1| Juuuuuuuuu...|\n",
+ "| 8| 0| Sunny Agai...|\n",
+ "| 9| 1| handed in m...|\n",
+ "| 10| 1| hmmmm.... i...|\n",
+ "| 11| 0| I must thin...|\n",
+ "| 12| 1| thanks to a...|\n",
+ "| 13| 0| this weeken...|\n",
+ "| 14| 0| jb isnt show...|\n",
+ "| 15| 0| ok thats it ...|\n",
+ "| 16| 0| <-------- ...|\n",
+ "| 17| 0| awhhe man.......|\n",
+ "| 18| 1| Feeling stran...|\n",
+ "| 19| 0| HUGE roll of ...|\n",
+ "| 20| 0| I just cut my...|\n",
+ "+------+---------+--------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"#Load the input data to be annotated\n",
"data = spark. \\\n",
@@ -36,7 +69,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"collapsed": true
},
@@ -59,7 +92,7 @@
"outputs": [],
"source": [
"### Sentence detector\n",
- "sentence_detector = SentenceDetectorModel() \\\n",
+ "sentence_detector = SentenceDetector() \\\n",
" .setInputCols([\"document\"]) \\\n",
" .setOutputCol(\"sentence\")\n",
"#sentence_data = sentence_detector.transform(checked)"
@@ -74,7 +107,7 @@
"outputs": [],
"source": [
"### Tokenizer\n",
- "tokenizer = RegexTokenizer() \\\n",
+ "tokenizer = Tokenizer() \\\n",
" .setInputCols([\"sentence\"]) \\\n",
" .setOutputCol(\"token\")\n",
"#tokenized = tokenizer.transform(assembled)"
@@ -154,7 +187,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"pipeline = Pipeline(stages=[\n",
@@ -178,7 +213,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"for r in sentiment_data.take(5):\n",
@@ -188,7 +225,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
@@ -201,7 +240,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
@@ -214,7 +255,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"start = time.time()\n",
diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
index 63e8cf208260d1..542e2d6a52f63c 100755
--- a/python/sparknlp/annotator.py
+++ b/python/sparknlp/annotator.py
@@ -333,7 +333,7 @@ def __init__(self):
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector", self.uid)
-class SentimentDetectorModel(AnnotatorTransformer):
+class SentimentDetector(AnnotatorTransformer):
dictPath = Param(Params._dummy(),
"dictPath",
"path for dictionary to sentiment analysis")
@@ -348,8 +348,8 @@ class SentimentDetectorModel(AnnotatorTransformer):
@keyword_only
def __init__(self):
- super(SentimentDetectorModel, self).__init__()
- self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel", self.uid)
+ super(SentimentDetector, self).__init__()
+ self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector", self.uid)
def setDictPath(self, value):
return self._set(dictPath=value)
diff --git a/python/test/annotators.py b/python/test/annotators.py
index e42745ffb7438b..b0ec7d74d1b193 100644
--- a/python/test/annotators.py
+++ b/python/test/annotators.py
@@ -16,7 +16,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer()\
+ tokenizer = Tokenizer()\
.setOutputCol("token")
stemmer = Stemmer() \
.setInputCols(["token"]) \
@@ -65,7 +65,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
@@ -85,7 +85,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Normalizer() \
.setInputCols(["token"]) \
@@ -121,7 +121,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
entity_extractor = EntityExtractor() \
.setOutputCol("entity")
@@ -139,10 +139,10 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setInputCols(["sentence"]) \
.setOutputCol("token")
pos_tagger = PerceptronApproach() \
@@ -166,7 +166,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence") \
.setCustomBounds(["%%"])
@@ -183,17 +183,17 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- sentence_detector = SentenceDetectorModel() \
+ sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setInputCols(["sentence"]) \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
.setOutputCol("lemma") \
.setDictionary({"missed": "miss"})
- sentiment_detector = SentimentDetectorModel() \
+ sentiment_detector = SentimentDetector() \
.setInputCols(["lemma", "sentence"]) \
.setOutputCol("sentiment") \
.setDictPath("../src/test/resources/sentiment-corpus/default-sentiment-dict.txt")
@@ -213,7 +213,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
@@ -248,7 +248,7 @@ def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
- tokenizer = RegexTokenizer() \
+ tokenizer = Tokenizer() \
.setOutputCol("token")
spell_checker = NorvigSweetingApproach() \
.setInputCols(["token"]) \
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index 610dd43d48ecb4..f08bdc2ab5f75f 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -24,7 +24,7 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
override val annotatorType: AnnotatorType = TOKEN
- /** A RegexTokenizer could require only for now a SentenceDetectorModel annotator */
+ /** A Tokenizer could require only for now a SentenceDetector annotator */
override val requiredAnnotatorTypes: Array[AnnotatorType] = Array[AnnotatorType](DOCUMENT)
def this() = this(Identifiable.randomUID("REGEX_TOKENIZER"))
@@ -56,7 +56,7 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
setDefault(wordPattern, "\\w+")
setDefault(extensionPattern, Array("\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*"))
setDefault(prefixPattern, Array("([^\\s\\w]?)"))
- setDefault(suffixPattern, Array("([^\\s\\w]?)"))
+ setDefault(suffixPattern, Array("([^\\s\\w]?)([^\\s\\w]*)"))
val ruleFactory = new RuleFactory(MatchStrategy.MATCH_ALL)
@@ -64,7 +64,8 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
/** Clears out rules and constructs a new rule for every combination of rules provided */
/** The strategy is to catch one token per regex group */
/** User may add its own groups if needs targets to be tokenized separately from the rest */
- /** "([^\\s\\w]?)(\\w+(?:\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*)?)([^\\s\\w]?)" */
+ /** "([^\s\w]?)(\w+(?:\.(?:\w{1}\.)+|(?:\-\w+)*)?)([^\s\w]?)([\s\w]*)" */
+ /** */
ruleFactory
.clearRules()
$(prefixPattern).foreach(pp => $(suffixPattern).foreach (sp => $(extensionPattern).foreach(ep => {
@@ -81,7 +82,6 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
(1 to m.content.groupCount)
.map (i => IndexedToken(m.content.group(i), text.begin + m.content.start, text.begin + m.content.end - 1))
}.filter(t => t.token.nonEmpty).toArray
- tokens.foreach(t => println(t.token))
TokenizedSentence(tokens)
}
}
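For context (not part of the patch): with the defaults in the hunk above, every prefix/suffix/extension combination is concatenated around wordPattern into a single rule, and each capturing group of a match becomes a candidate token. A minimal plain-Scala sketch of that combination follows; the object name and sample text are illustrative only.

object CombinedPatternSketch extends App {
  // defaults from the hunk above, combined as: prefix + "(" + word + "(?:" + extension + ")?" + ")" + suffix
  val prefix    = "([^\\s\\w]?)"
  val word      = "\\w+"
  val extension = "\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*"
  val suffix    = "([^\\s\\w]?)([^\\s\\w]*)"   // new default: extra trailing group catches leftover punctuation
  val rule      = (prefix + "(" + word + "(?:" + extension + ")?" + ")" + suffix).r

  // every capturing group becomes a candidate token; empty groups are dropped
  val tokens = rule.findAllMatchIn("(U.S.A.).")
    .flatMap(m => (1 to m.groupCount).map(m.group))
    .filter(g => g != null && g.nonEmpty)
    .toList
  println(tokens)   // List((, U.S.A., ), .)
}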
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
similarity index 80%
rename from src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala
rename to src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
index 4915ffa57e018e..bdb47d64f953a3 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexTokenizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
@@ -7,10 +7,10 @@ import org.scalatest._
import scala.language.reflectiveCalls
-trait RegexTokenizerBehaviors { this: FlatSpec =>
+trait TokenizerBehaviors { this: FlatSpec =>
def fixture(dataset: => Dataset[Row]) = new {
- val df = AnnotatorBuilder.withTokenizer(AnnotatorBuilder.withFullPragmaticSentenceDetector(dataset))
+ val df = AnnotatorBuilder.withTokenizer(AnnotatorBuilder.withTokenizer(dataset))
val documents = df.select("document")
val sentences = df.select("sentence")
val tokens = df.select("token")
@@ -34,16 +34,16 @@ trait RegexTokenizerBehaviors { this: FlatSpec =>
}
def fullTokenizerPipeline(dataset: => Dataset[Row]) {
- "A RegexTokenizer Annotator" should "successfully transform data" in {
+ "A Tokenizer Annotator" should "successfully transform data" in {
val f = fixture(dataset)
- assert(f.tokensAnnotations.nonEmpty, "RegexTokenizer should add annotators")
+ assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators")
}
it should "annotate using the annotatorType of token" in {
val f = fixture(dataset)
- assert(f.tokensAnnotations.nonEmpty, "RegexTokenizer should add annotators")
+ assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators")
f.tokensAnnotations.foreach { a =>
- assert(a.annotatorType == AnnotatorType.TOKEN, "RegexTokenizer annotations type should be equal to 'token'")
+ assert(a.annotatorType == AnnotatorType.TOKEN, "Tokenizer annotations type should be equal to 'token'")
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
index 10b7786142e756..3f692ee79517d8 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -11,7 +11,7 @@ import org.apache.spark.ml.Pipeline
/**
* Created by saif on 02/05/17.
*/
-class TokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors {
+class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
val regexTokenizer = new Tokenizer
@@ -22,17 +22,42 @@ class TokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors {
"a Tokenizer" should "correctly tokenize target text on its defaults parameters" in {
val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 'He said', I'll defeat markus-crassus.")
import data.sparkSession.implicits._
- val tokenizer = new Tokenizer().setInputCols("text").setOutputCol("token")
- val sentence = new SentenceDetector().setInputCols("token").setOutputCol("sentence")
- val finisher = new Finisher().setInputCols("sentence")//.setOutputAsArray(true)
- val pipeline = new Pipeline().setStages(Array(tokenizer, sentence, finisher))
- pipeline.fit(data).transform(data).select("finished_sentence").show
- assert(pipeline.fit(data).transform(data).select("output").as[Array[String]]
- .collect
- .sameElements(Array(
- "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
- "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crasus", ".")
- ))
+ val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
+ val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token")
+ val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
+ val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))
+ val expected = Array(
+ "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
+ "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", "."
+ )
+ val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
+ .collect.flatten
+ assert(
+ result.sameElements(expected),
+ s"because result tokens differ: " +
+ s"\nresult was \n${result.mkString("|")} \nexpected is: \n${expected.mkString("|")}"
+ )
+ }
+
+ "a Tokenizer" should "correctly tokenize target sentences on its defaults parameters" in {
+ val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 'He said', I'll defeat markus-crassus.")
+ import data.sparkSession.implicits._
+ val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
+ val sentence = new SentenceDetector().setInputCols("document").setOutputCol("sentence")
+ val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token")
+ val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
+ val pipeline = new Pipeline().setStages(Array(document, sentence, tokenizer, finisher))
+ val expected = Array(
+ "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
+ "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", "."
+ )
+ val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
+ .collect.flatten
+ assert(
+ result.sameElements(expected),
+ s"because result tokens differ: " +
+ s"\nresult was \n${result.mkString("|")} \nexpected is: \n${expected.mkString("|")}"
+ )
}
"a spark based tokenizer" should "resolve big data" in {
@@ -55,6 +80,6 @@ class TokenizerTestSpec extends FlatSpec with RegexTokenizerBehaviors {
val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)
- "A full RegexTokenizer pipeline with latin content" should behave like fullTokenizerPipeline(latinBodyData)
+ "A full Tokenizer pipeline with latin content" should behave like fullTokenizerPipeline(latinBodyData)
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
index 8aca483f910e7a..b1f053161ab7a3 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetectorBoundsSpec.scala
@@ -8,7 +8,7 @@ class SentenceDetectorBoundsSpec extends FlatSpec {
val model = new PragmaticMethod(false)
- "SentenceDetectorModel" should "return correct sentence bounds" in {
+ "SentenceDetector" should "return correct sentence bounds" in {
val bounds = model.extractBounds("Hello World!! New Sentence", Array.empty[String])
assert(bounds.length == 2)
@@ -16,7 +16,7 @@ class SentenceDetectorBoundsSpec extends FlatSpec {
assert(bounds(1) == Sentence("New Sentence", 14, 25))
}
- "SentenceDetectorModel" should "correct return sentence bounds with whitespaces" in {
+ "SentenceDetector" should "correct return sentence bounds with whitespaces" in {
val bounds = model.extractBounds(" Hello World!! . New Sentence ", Array.empty[String])
assert(bounds.length == 3)
From 5311bec2744e8ad535737436304e234553fbb080 Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 03:03:58 -0300
Subject: [PATCH 4/9] - New tokenizer wrap up
---
python/sparknlp/annotator.py | 42 ++++-
python/test/annotators.py | 6 +-
.../com/johnsnowlabs/nlp/Annotation.scala | 4 +-
.../com/johnsnowlabs/nlp/TokenAssembler.scala | 2 +-
.../nlp/annotators/Lemmatizer.scala | 2 +-
.../nlp/annotators/Normalizer.scala | 2 +-
.../johnsnowlabs/nlp/annotators/Stemmer.scala | 2 +-
.../nlp/annotators/Tokenizer.scala | 145 +++++++++++++-----
.../annotators/common/DependencyParsed.scala | 2 +-
.../nlp/annotators/common/SentenceSplit.scala | 6 +-
.../nlp/annotators/common/Tagged.scala | 8 +-
.../nlp/annotators/common/Tokenized.scala | 4 +-
.../pragmatic/PragmaticContentFormatter.scala | 2 +-
.../spell/norvig/NorvigSweetingModel.scala | 2 +-
.../nlp/util/io/ResourceHelper.scala | 2 +-
.../nlp/util/regex/RuleFactory.scala | 2 +-
.../nlp/AnnotatorBaseTestSpec.scala | 4 +-
.../nlp/DocumentAssemblerTestSpec.scala | 2 +-
.../nlp/annotators/NormalizerBehaviors.scala | 11 +-
.../nlp/annotators/TokenizerBehaviors.scala | 4 +-
.../nlp/annotators/TokenizerTestSpec.scala | 44 +++---
.../ner/crf/NerCrfApproachSpec.scala | 2 +-
.../dep/DependencyParserApproachTest.scala | 2 +-
.../GreedyTransitionApproachTest.scala | 6 +-
.../PerceptronApproachTestSpec.scala | 1 -
25 files changed, 208 insertions(+), 101 deletions(-)
diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
index 542e2d6a52f63c..8c1050f10f6989 100755
--- a/python/sparknlp/annotator.py
+++ b/python/sparknlp/annotator.py
@@ -85,18 +85,50 @@ def __init__(self):
class Tokenizer(AnnotatorTransformer):
- pattern = Param(Params._dummy(),
- "pattern",
- "regular expression pattern for tokenization",
+ targetPattern = Param(Params._dummy(),
+ "targetPattern",
+ "pattern to grab from text as token candidates. Defaults \S+",
typeConverter=TypeConverters.toString)
+ prefixPattern = Param(Params._dummy(),
+ "prefixPattern",
+ "regex with groups and begins with \A to match target prefix. Defaults to \A([^\s\w\$\.]*)",
+ typeConverter=TypeConverters.toString)
+
+ suffixPatern = Param(Params._dummy(),
+ "suffixPatern",
+ "regex with groups and ends with \z to match target suffix. Defaults to ([^\s\w]?)([^\s\w]*)\z",
+ typeConverter=TypeConverters.toString)
+
+ compositeTokens = Param(Params._dummy(),
+ "compositeTokens",
+ "Words that won't be split in two",
+ typeConverter=TypeConverters.toListString)
+
+ infixPatterns = Param(Params._dummy(),
+ "infixPatterns",
+ "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults",
+ typeConverter=TypeConverters.toListString)
+
@keyword_only
def __init__(self):
super(Tokenizer, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.Tokenizer", self.uid)
- def setPattern(self, value):
- return self._set(pattern=value)
+ def setTargetPattern(self, value):
+ return self._set(targetPattern=value)
+
+ def setPrefixPattern(self, value):
+ return self._set(prefixPattern=value)
+
+ def setSuffixPattern(self, value):
+ return self._set(suffixPattern=value)
+
+ def setCompositeTokens(self, value):
+ return self._set(compositeTokens=value)
+
+ def setInfixPatterns(self, value):
+ return self._set(infixPatterns=value)
class Stemmer(AnnotatorTransformer):
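For orientation (not part of the patch): the Python parameters above mirror the setters on the Scala Tokenizer reworked later in this patch. A hedged usage sketch against the Scala annotator, using the default values quoted in the parameter docs:

import com.johnsnowlabs.nlp.annotators.Tokenizer

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")
  .setTargetPattern("\\S+")                         // candidates are whitespace-separated chunks
  .setPrefixPattern("\\A([^\\s\\w\\$\\.]*)")        // must start with \A and contain regex groups
  .setSuffixPattern("([^\\s\\w]?)([^\\s\\w]*)\\z")  // must end with \z and contain regex groups
  .setCompositeTokens(Array("New York"))            // words protected from being split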
diff --git a/python/test/annotators.py b/python/test/annotators.py
index b0ec7d74d1b193..21051f72d5dd25 100644
--- a/python/test/annotators.py
+++ b/python/test/annotators.py
@@ -17,7 +17,8 @@ def runTest(self):
.setInputCol("text") \
.setOutputCol("document")
tokenizer = Tokenizer()\
- .setOutputCol("token")
+ .setOutputCol("token") \
+ .setCompositeTokens(["New York"])
stemmer = Stemmer() \
.setInputCols(["token"]) \
.setOutputCol("stem")
@@ -29,7 +30,8 @@ def runTest(self):
.setOutputCol("assembled")
finisher = Finisher() \
.setInputCols(["assembled"]) \
- .setOutputCols(["reassembled_view"])
+ .setOutputCols(["reassembled_view"]) \
+ .setCleanAnnotations(True)
assembled = document_assembler.transform(self.data)
tokenized = tokenizer.transform(assembled)
stemmed = stemmer.transform(tokenized)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
index d233cd09853292..f5b7a470762e8b 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
@@ -10,11 +10,11 @@ import scala.collection.Map
/**
* represents annotator's output parts and their details
* @param annotatorType the type of annotation
- * @param begin the index of the first character under this annotation
+ * @param start the index of the first character under this annotation
* @param end the index after the last character under this annotation
* @param metadata associated metadata for this annotation
*/
-case class Annotation(annotatorType: String, begin: Int, end: Int, result: String, metadata: Map[String, String])
+case class Annotation(annotatorType: String, start: Int, end: Int, result: String, metadata: Map[String, String])
object Annotation {
diff --git a/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
index a8f9a4ed949f0a..f6011b31ad477f 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/TokenAssembler.scala
@@ -21,7 +21,7 @@ class TokenAssembler(override val uid: String) extends AnnotatorModel[TokenAssem
.map{case (_, sentenceAnnotations) =>
Annotation(
DOCUMENT,
- sentenceAnnotations.minBy(_.begin).begin,
+ sentenceAnnotations.minBy(_.start).start,
sentenceAnnotations.maxBy(_.end).end,
sentenceAnnotations.map(_.result).mkString(" "),
Map.empty[String, String]
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
index 0c1863b0d33fa6..1dd0739bfff007 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Lemmatizer.scala
@@ -78,7 +78,7 @@ class Lemmatizer(override val uid: String) extends AnnotatorModel[Lemmatizer] {
val token = tokenAnnotation.result
Annotation(
annotatorType,
- tokenAnnotation.begin,
+ tokenAnnotation.start,
tokenAnnotation.end,
$$(lemmaDict).getOrElse(token, token),
tokenAnnotation.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
index bdca08c85a32c8..ca83ae0f6c3d8c 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
@@ -44,7 +44,7 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
.trim
Annotation(
annotatorType,
- token.begin,
+ token.start,
token.end,
nToken,
token.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
index b70e82490e2775..833ca16c5cbb45 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Stemmer.scala
@@ -37,7 +37,7 @@ class Stemmer(override val uid: String) extends AnnotatorModel[Stemmer] {
val stem = EnglishStemmer.stem(tokenAnnotation.result)
Annotation(
annotatorType,
- tokenAnnotation.begin,
+ tokenAnnotation.start,
tokenAnnotation.end,
stem,
tokenAnnotation.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index f08bdc2ab5f75f..350a877ad0759a 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -3,10 +3,10 @@ package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.annotators.common._
import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}
import org.apache.spark.ml.param.{Param, StringArrayParam}
-import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType}
+import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
-import scala.util.matching.Regex
+import scala.collection.mutable.ArrayBuffer
/**
* Tokenizes raw text into word pieces, tokens.
@@ -17,10 +17,11 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
import com.johnsnowlabs.nlp.AnnotatorType._
- val wordPattern: Param[String] = new Param(this, "wordPattern", "this is the base word pattern. Defaults \\w+")
- val extensionPattern: StringArrayParam = new StringArrayParam(this, "infixPattern", "infix patterns allow for word exceptions that count as single token. E.g. U.S.A. Defaults ")
- val prefixPattern: StringArrayParam = new StringArrayParam(this, "prefixPattern", "this is the token pattern")
- val suffixPattern: StringArrayParam = new StringArrayParam(this, "suffixPattern", "this is the token pattern")
+ val compositeTokens: StringArrayParam = new StringArrayParam(this, "compositeTokens", "Words that won't be split in two")
+ val targetPattern: Param[String] = new Param(this, "targetPattern", "pattern to grab from text as token candidates. Defaults \\S+")
+ val infixPatterns: StringArrayParam = new StringArrayParam(this, "infixPattern", "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults")
+ val prefixPattern: Param[String] = new Param[String](this, "prefixPattern", "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\w\\$\\.]*)")
+ val suffixPattern: Param[String] = new Param[String](this, "suffixPattern", "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\w]?)([^\\s\\w]*)\\z")
override val annotatorType: AnnotatorType = TOKEN
@@ -29,59 +30,125 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
def this() = this(Identifiable.randomUID("REGEX_TOKENIZER"))
- def setWordPattern(value: String): this.type = set(wordPattern, value)
+ def setTargetPattern(value: String): this.type = set(targetPattern, value)
- def setExtensionPattern(value: Array[String]): this.type = set(extensionPattern, value)
+ def setExtensionPatterns(value: Array[String]): this.type = set(infixPatterns, value)
- def addExtensionPattern(value: String): this.type = set(extensionPattern, $(extensionPattern) :+ value)
+ def addInfixPattern(value: String): this.type = set(infixPatterns, value +: $(infixPatterns))
- def setPrefixPattern(value: Array[String]): this.type = set(prefixPattern, value)
+ def setPrefixPattern(value: String): this.type = set(prefixPattern, value)
- def addPrefixPattern(value: String): this.type = set(prefixPattern, $(prefixPattern) :+ value)
+ def setSuffixPattern(value: String): this.type = set(suffixPattern, value)
- def setSuffixPattern(value: Array[String]): this.type = set(suffixPattern, value)
+ def setCompositeTokens(value: Array[String]): this.type = set(compositeTokens, value)
- def addSuffixPattern(value: String): this.type = set(suffixPattern, $(suffixPattern) :+ value)
+ def getCompositeTokens: Array[String] = $(compositeTokens)
- def getWordPattern: String = $(wordPattern)
+ def getInfixPatterns: Array[String] = $(infixPatterns)
- def getInfixPattern: Array[String] = $(extensionPattern)
+ def getPrefixPattern: String = $(prefixPattern)
- def getPrefixPattern: Array[String] = $(prefixPattern)
+ def getSuffixPattern: String = $(suffixPattern)
- def getSuffixPattern: Array[String] = $(suffixPattern)
+ def getTargetPattern: String = $(targetPattern)
setDefault(inputCols, Array(DOCUMENT))
- setDefault(wordPattern, "\\w+")
- setDefault(extensionPattern, Array("\\.(?:\\w{1}\\.)+|(?:\\-\\w+)*"))
- setDefault(prefixPattern, Array("([^\\s\\w]?)"))
- setDefault(suffixPattern, Array("([^\\s\\w]?)([^\\s\\w]*)"))
+ lazy private val ruleFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
- val ruleFactory = new RuleFactory(MatchStrategy.MATCH_ALL)
-
- override def beforeAnnotate(): Unit = {
- /** Clears out rules and constructs a new rule for every combination of rules provided */
- /** The strategy is to catch one token per regex group */
- /** User may add its own groups if needs targets to be tokenized separately from the rest */
- /** "([^\s\w]?)(\w+(?:\.(?:\w{1}\.)+|(?:\-\w+)*)?)([^\s\w]?)([\s\w]*)" */
- /** */
+ /** Clears out rules and constructs a new rule for every combination of rules provided */
+ /** The strategy is to catch one token per regex group */
+ /** User may add its own groups if needs targets to be tokenized separately from the rest */
+ protected def setFactoryRules(): Unit = {
ruleFactory
.clearRules()
- $(prefixPattern).foreach(pp => $(suffixPattern).foreach (sp => $(extensionPattern).foreach(ep => {
- ruleFactory.addRule(
- (pp + "(" + $(wordPattern) + "(?:" + ep + ")?" + ")" + sp).r,
- "tokenizer construction pattern"
- )
- })))
+ val rules = ArrayBuffer.empty[String]
+ require($(infixPatterns).nonEmpty)
+ require($(infixPatterns).forall(ip => ip.contains("(") && ip.contains(")")),
+ "infix patterns must use regex group. Notice each group will result in separate token")
+ $(infixPatterns).foreach(ip => {
+ val rule = new StringBuilder
+ get(prefixPattern).orElse(getDefault(prefixPattern)).foreach(pp => {
+ require(pp.startsWith("\\A"), "prefixPattern must begin with \\A to ensure it is the beginning of the string")
+ require(pp.contains("(") && pp.contains(")"), "prefixPattern must contain regex groups. Each group will return in separate token")
+ rule.append(pp)
+ })
+ rule.append(ip)
+ get(suffixPattern).orElse(getDefault(suffixPattern)).foreach(sp => {
+ require(sp.endsWith("\\z"), "suffixPattern must end with \\z to ensure it is the end of the string")
+ require(sp.contains("(") && sp.contains(")"), "suffixPattern must contain regex groups. Each group will return in separate token")
+ rule.append(sp)
+ })
+ rules.append(rule.toString)
+ })
+ rules.foreach(rule => ruleFactory.addRule(rule.r, rule))
}
+ /** Check here for explanation on this default pattern */
+ setDefault(infixPatterns, Array(
+ "((?:\\w+\\.)+)", // http://rubular.com/r/cRBtGuLlF6
+ "(\\w+)(n't\\b)", // http://rubular.com/r/coeYJFt8eM
+ "(\\w+)('{1}\\w+)", // http://rubular.com/r/N84PYwYjQp
+ "((?:\\w+[^\\s\\w]{1})+\\w+)", // http://rubular.com/r/wOvQcey9e3
+ "(\\w+)" // basic word token
+ ))
+ /** These catch everything before and after a word, as a separate token*/
+ setDefault(prefixPattern, "\\A([^\\s\\w\\$\\.]*)")
+ setDefault(suffixPattern, "([^\\s\\w]?)([^\\s\\w]*)\\z")
+ setDefault(targetPattern, "\\S+")
+
+ setFactoryRules()
+
+ override def beforeAnnotate(): Unit = {
+ setFactoryRules()
+ }
+
+ private val PROTECT_STR = "ↈ"
+
def tag(sentences: Seq[Sentence]): Seq[TokenizedSentence] = {
sentences.map{text =>
- val tokens = ruleFactory.findMatch(text.content).flatMap { m =>
- (1 to m.content.groupCount)
- .map (i => IndexedToken(m.content.group(i), text.begin + m.content.start, text.begin + m.content.end - 1))
- }.filter(t => t.token.nonEmpty).toArray
+ /** Step 1, protect exception words from being broken*/
+ var protected_text = text.content
+ if (get(compositeTokens).isDefined) {
+ $(compositeTokens).foreach(tokenException =>
+ protected_text = protected_text.replaceAll(
+ tokenException,
+ tokenException.replaceAll("[^(?:" + $(targetPattern) + ")]", PROTECT_STR)
+ )
+ )
+ }
+ /** Step 2, Return protected exception tokens back into text and move on*/
+ val tokens = $(targetPattern).r.findAllMatchIn(protected_text).flatMap { candidate =>
+ if (get(compositeTokens).isDefined && candidate.matched.contains(PROTECT_STR)) {
+ /** Put back character and move on */
+ Seq(IndexedToken(
+ text.content.slice(text.start + candidate.start, text.start + candidate.end),
+ text.start + candidate.start,
+ text.start + candidate.end - 1
+ ))
+ }
+ else {
+ /** Step 3, If no exception found, find candidates through the possible general rule patterns*/
+ ruleFactory.findMatchFirstOnly(candidate.matched).map {m =>
+ var curPos = m.content.start
+ (1 to m.content.groupCount)
+ .map (i => {
+ val target = m.content.group(i)
+ val it = IndexedToken(
+ target,
+ text.start + candidate.start + curPos,
+ text.start + candidate.start + curPos + target.length - 1
+ )
+ curPos += target.length
+ it
+ })
+ /** Step 4, If rules didn't match, return whatever candidate we have and leave it as is*/
+ }.getOrElse(Seq(IndexedToken(
+ candidate.matched,
+ text.start + candidate.start,
+ text.start + candidate.end - 1
+ )))
+ }}.toArray.filter(t => t.token.nonEmpty)
TokenizedSentence(tokens)
}
}
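A standalone sketch (plain Scala, no Spark NLP imports) of what the rewritten tag() does per candidate: each targetPattern (\S+) chunk is matched against \A + prefix groups + one infix pattern + suffix groups + \z, every capturing group that matched becomes a token, and if no rule matches the candidate is kept whole. Only the first infix default is used here, so the fallback branch fires more often than in the real annotator.

object NewTokenizerSketch extends App {
  val prefix = "\\A([^\\s\\w\\$\\.]*)"
  val infix  = "((?:\\w+\\.)+)"                 // first infix default, http://rubular.com/r/cRBtGuLlF6
  val suffix = "([^\\s\\w]?)([^\\s\\w]*)\\z"
  val rule   = (prefix + infix + suffix).r

  val candidates = Seq("U.S.A.", "(U.K.).", "hello")   // \S+ chunks of some input text
  candidates.foreach { cand =>
    rule.findFirstMatchIn(cand) match {
      case Some(m) =>
        val tokens = (1 to m.groupCount).map(m.group).filter(g => g != null && g.nonEmpty)
        println(s"$cand -> ${tokens.mkString("|")}")
      case None =>
        println(s"$cand -> $cand")              // step 4: no rule matched, keep candidate as-is
    }
  }
  // U.S.A. -> U.S.A.
  // (U.K.). -> (|U.K.|)|.
  // hello -> hello
}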
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
index 2df2d514070227..3beb73cb97e217 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DependencyParsed.scala
@@ -15,7 +15,7 @@ object DependencyParsed extends Annotated[DependencyParsedSentence]{
val sentences = Tokenized.unpack(annotations)
val depAnnotations = annotations
.filter(a => a.annotatorType == annotatorType)
- .sortBy(a => a.begin)
+ .sortBy(a => a.start)
var last = 0
sentences.map{sentence =>
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
index 0a2d16b5aaa7b9..badb8644ffb316 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceSplit.scala
@@ -5,7 +5,7 @@ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}
/**
* structure representing a sentence and its boundaries
*/
-case class Sentence(content: String, begin: Int, end: Int)
+case class Sentence(content: String, start: Int, end: Int)
object Sentence {
def fromTexts(texts: String*): Seq[Sentence] = {
@@ -27,11 +27,11 @@ object SentenceSplit extends Annotated[Sentence] {
override def unpack(annotations: Seq[Annotation]): Seq[Sentence] = {
annotations.filter(_.annotatorType == annotatorType)
.map(annotation =>
- Sentence(annotation.result, annotation.begin, annotation.end)
+ Sentence(annotation.result, annotation.start, annotation.end)
)
}
override def pack(items: Seq[Sentence]): Seq[Annotation] = {
- items.map(item => Annotation(annotatorType, item.begin, item.end, item.content, Map.empty[String, String]))
+ items.map(item => Annotation(annotatorType, item.start, item.end, item.content, Map.empty[String, String]))
}
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
index b64c5fb33bfc81..629a8099197544 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala
@@ -15,17 +15,17 @@ trait Tagged[T >: TaggedSentence <: TaggedSentence] extends Annotated[T] {
val tokenized = Tokenized.unpack(annotations)
val tagAnnotations = annotations
.filter(a => a.annotatorType == annotatorType)
- .sortBy(a => a.begin)
+ .sortBy(a => a.start)
.toIterator
var annotation: Option[Annotation] = None
tokenized.map { sentence =>
val tokens = sentence.indexedTokens.map { token =>
- while (tagAnnotations.hasNext && (annotation.isEmpty || annotation.get.begin < token.begin))
+ while (tagAnnotations.hasNext && (annotation.isEmpty || annotation.get.start < token.begin))
annotation = Some(tagAnnotations.next)
- val tag = if (annotation.isDefined && annotation.get.begin == token.begin)
+ val tag = if (annotation.isDefined && annotation.get.start == token.begin)
annotation.get.result
else
emptyTag
@@ -69,7 +69,7 @@ trait Tagged[T >: TaggedSentence <: TaggedSentence] extends Annotated[T] {
}
protected def getLabels(sentences: Seq[TaggedSentence], labelAnnotations: Seq[Annotation]): Seq[TextSentenceLabels] = {
- val position2Tag = labelAnnotations.map(a => (a.begin, a.end) -> a.result).toMap
+ val position2Tag = labelAnnotations.map(a => (a.start, a.end) -> a.result).toMap
sentences.map{sentence =>
val labels = sentence.indexedTaggedWords.map { w =>
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
index e02bec3e79e339..c1128c5be7c976 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tokenized.scala
@@ -13,8 +13,8 @@ object Tokenized extends Annotated[TokenizedSentence] {
SentenceSplit.unpack(annotations).map(sentence => {
tokens.filter(token =>
- token.begin >= sentence.begin & token.end <= sentence.end
- ).map(token => IndexedToken(token.result, token.begin, token.end))
+ token.start >= sentence.start & token.end <= sentence.end
+ ).map(token => IndexedToken(token.result, token.start, token.end))
}).filter(_.nonEmpty).map(indexedTokens => TokenizedSentence(indexedTokens))
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
index ca0432788c411b..d78b133a18cd89 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala
@@ -199,7 +199,7 @@ class PragmaticContentFormatter(text: String) {
val factory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
// http://rubular.com/r/G2opjedIm9
//special periods
- .addRule(new RegexRule("http://rubular.com/r/G2opjedIm9", "formatGeo"))
+ .addRule(new RegexRule("(?<=[a-zA-z]°)\\.(?=\\s*\\d+)", "formatGeo"))
wip = factory.transformWithSymbol(MULT_PERIOD, wip)
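Quick standalone check (not part of the patch) of what the restored formatGeo pattern matches: the period in markers like "N°. 1026" is protected so it is not taken as a sentence boundary. The replacement string below is only illustrative; the real formatter substitutes its own MULT_PERIOD symbol.

object GeoPeriodSketch extends App {
  val formatGeo = "(?<=[a-zA-z]°)\\.(?=\\s*\\d+)".r
  val text = "You can find it at N°. 1026.253.553."
  println(formatGeo.replaceAllIn(text, "_"))
  // You can find it at N°_ 1026.253.553.
}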
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
index e12e0899eedb64..2f3814196d63e7 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingModel.scala
@@ -237,7 +237,7 @@ class NorvigSweetingModel(override val uid: String) extends AnnotatorModel[Norvi
annotations.map { token =>
Annotation(
annotatorType,
- token.begin,
+ token.start,
token.end,
check(token.result),
token.metadata
diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
index a3d4837011d899..26eb40187b135a 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala
@@ -370,7 +370,7 @@ object ResourceHelper {
val tokenizer = new Tokenizer()
.setInputCols("document")
.setOutputCol("token")
- .setWordPattern(tokenPattern)
+ .setTargetPattern(tokenPattern)
val normalizer = new Normalizer()
.setInputCols("token")
.setOutputCol("normal")
diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
index f7a544d9b1d650..70c2ac5cafbcda 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
@@ -69,7 +69,7 @@ class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
case _ => throw new IllegalArgumentException("Invalid match strategy")
}
- private val transformWithSymbolFunc = (text: String, symbol: String) => transformStrategy match {
+ private val transformWithSymbolFunc = (symbol: String, text: String) => transformStrategy match {
case APPEND_WITH_SYMBOL => rules.foldRight(text)((rule, target) => transformMatch(target, rule.regex)({ m =>
logger.debug("Matched: {} from: {} using rule {} with strategy {}",
() => m.matched,
diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
index 30fc4466f57010..03167277af7ba1 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBaseTestSpec.scala
@@ -103,13 +103,13 @@ class AnnotatorBaseTestSpec extends FlatSpec {
val contentMeta = result.select("demand", "result").take(1).head.getSeq[Row](0)
val contentAnnotation = contentMeta.map(Annotation(_)).head
assert(contentAnnotation.annotatorType == dummyAnnotator.annotatorType)
- assert(contentAnnotation.begin == 0)
+ assert(contentAnnotation.start == 0)
assert(contentAnnotation.end == 25)
assert(contentAnnotation.metadata.contains("a") && contentAnnotation.metadata("a") == "b")
val demandContentMeta = result.select("demand", "result").take(1).head.getSeq[Row](1)
val demandContentAnnotation = demandContentMeta.map(Annotation(_)).head
assert(demandContentAnnotation.annotatorType == demandingDummyAnnotator.annotatorType)
- assert(demandContentAnnotation.begin == 11)
+ assert(demandContentAnnotation.start == 11)
assert(demandContentAnnotation.end == 18)
assert(demandContentAnnotation.metadata.contains("aa") && demandContentAnnotation.metadata("aa") == "bb")
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
index 448fae3b41de92..326fed4adc2ae1 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/DocumentAssemblerTestSpec.scala
@@ -18,7 +18,7 @@ class DocumentAssemblerTestSpec extends FlatSpec {
"A DocumentAssembler" should "annotate with the correct indexes" in {
val f = fixture
- f.text.head should equal (f.text(f.assembledDoc.head.begin))
+ f.text.head should equal (f.text(f.assembledDoc.head.start))
f.text.last should equal (f.text(f.assembledDoc.head.end))
}
}
\ No newline at end of file
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
index 0572a63009172e..57f7ec8d50024f 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
@@ -27,17 +27,16 @@ trait NormalizerBehaviors { this: FlatSpec =>
AnnotatorBuilder.withCaseSensitiveNormalizer(dataset)
.collect().foreach {
row =>
- val tokens = row.getSeq[Row](3).map(Annotation(_))
+ val tokens = row.getSeq[Row](3).map(Annotation(_)).filterNot(a => a.result == "." || a.result == ",")
val normalizedAnnotations = row.getSeq[Row](4).map(Annotation(_))
normalizedAnnotations.foreach {
- case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
- assert(stem.result.nonEmpty, "Annotation result exists")
+ case nToken: Annotation if nToken.annotatorType == AnnotatorType.TOKEN =>
+ assert(nToken.result.nonEmpty, "Annotation result exists")
case _ =>
}
-
normalizedAnnotations.zip(tokens).foreach {
- case (stem: Annotation, token: Annotation) =>
- assert(stem.result == token.result.replaceAll("[^a-zA-Z]", ""))
+ case (nToken: Annotation, token: Annotation) =>
+ assert(nToken.result == token.result.replaceAll("[^a-zA-Z]", ""))
}
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
index bdb47d64f953a3..415bf09e92e371 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerBehaviors.scala
@@ -51,8 +51,8 @@ trait TokenizerBehaviors { this: FlatSpec =>
val f = fixture(dataset)
f.tokensAnnotations.foreach { a =>
val token = a.result
- val sentenceToken = f.corpus.slice(a.begin, a.end + 1)
- assert(sentenceToken == token, s"Word ($sentenceToken) from sentence at (${a.begin},${a.end}) should be equal to token ($token) inside the corpus ${f.corpus}")
+ val sentenceToken = f.corpus.slice(a.start, a.end + 1)
+ assert(sentenceToken == token, s"Word ($sentenceToken) from sentence at (${a.start},${a.end}) should be equal to token ($token) inside the corpus ${f.corpus}")
}
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
index 3f692ee79517d8..688c420a1f7ad5 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -13,44 +13,52 @@ import org.apache.spark.ml.Pipeline
*/
class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
+ import SparkAccessor.spark.implicits._
+
val regexTokenizer = new Tokenizer
"a Tokenizer" should s"be of type ${AnnotatorType.TOKEN}" in {
assert(regexTokenizer.annotatorType == AnnotatorType.TOKEN)
}
- "a Tokenizer" should "correctly tokenize target text on its defaults parameters" in {
- val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 'He said', I'll defeat markus-crassus.")
- import data.sparkSession.implicits._
+
+ val targetText = "Hello, I won't be from New York in the U.S.A. (and you know it). Give me my horse! or $100 bucks 'He said', I'll defeat markus-crassus."
+ val expected = Array(
+ "Hello", ",", "I", "wo", "n't", "be", "from", "New York", "in", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
+ "Give", "me", "my", "horse", "!", "or", "$100", "bucks", "'", "He", "said", "'", ",", "I", "'ll", "defeat", "markus-crassus", "."
+ )
+
+ "a Tokenizer" should "correctly tokenize target text on its defaults parameters with exceptions" in {
+ val data = DataBuilder.basicDataBuild(targetText)
val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
- val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token")
- val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
+ val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token").setCompositeTokens(Array("New York", "won't"))
+ val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setCleanAnnotations(false).setOutputCols("output")
val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))
- val expected = Array(
- "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
- "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", "."
- )
- val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
+ val pip = pipeline.fit(data).transform(data)
+ val result = pip
+ .select("output").as[Array[String]]
.collect.flatten
assert(
result.sameElements(expected),
s"because result tokens differ: " +
s"\nresult was \n${result.mkString("|")} \nexpected is: \n${expected.mkString("|")}"
)
+ pip
+ .select("token").as[Array[Annotation]]
+ .collect.foreach(annotations => {
+ annotations.foreach(annotation => {
+ assert(targetText.slice(annotation.start, annotation.end + 1) == annotation.result)
+ })
+ })
}
- "a Tokenizer" should "correctly tokenize target sentences on its defaults parameters" in {
- val data = DataBuilder.basicDataBuild("Hello, I am from the U.S.A. (and you know it). Give me my horse! 'He said', I'll defeat markus-crassus.")
- import data.sparkSession.implicits._
+ "a Tokenizer" should "correctly tokenize target sentences on its defaults parameters with exceptions" in {
+ val data = DataBuilder.basicDataBuild(targetText)
val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
val sentence = new SentenceDetector().setInputCols("document").setOutputCol("sentence")
- val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token")
+ val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token").setCompositeTokens(Array("New York"))
val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setOutputCols("output")
val pipeline = new Pipeline().setStages(Array(document, sentence, tokenizer, finisher))
- val expected = Array(
- "Hello", ",", "I", "am", "from", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
- "Give", "me", "my", "horse", "!", "'", "He", "said", "'", ",", "I", "'", "ll", "defeat", "markus-crassus", "."
- )
val result = pipeline.fit(data).transform(data).select("output").as[Array[String]]
.collect.flatten
assert(
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
index 327f84e7d80738..5ffcd439c6bb51 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
@@ -35,7 +35,7 @@ class NerCrfApproachSpec extends FlatSpec {
assert(annotations.length == labels.length)
for ((annotation, label) <- annotations.zip(labels)) {
- assert(annotation.begin == label.begin)
+ assert(annotation.start == label.start)
assert(annotation.end == label.end)
assert(annotation.annotatorType == AnnotatorType.NAMED_ENTITY)
assert(annotation.result == label.result)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
index c76e272bf94e5e..e09a7dddcb2bea 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTest.scala
@@ -52,6 +52,6 @@ class DependencyParserApproachTest extends FlatSpec {
val f = fixture
f.depAnnotations
.zip(f.tokenAnnotations)
- .foreach { case (dep, token) => assert(dep.begin == token.begin && dep.end == token.end, s"Token and word should have equal indixes") }
+ .foreach { case (dep, token) => assert(dep.start == token.start && dep.end == token.end, s"Token and word should have equal indices") }
}
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
index 7a92c52cca9ede..306c1de43d0743 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproachTest.scala
@@ -14,15 +14,15 @@ class GreedyTransitionApproachTest extends FlatSpec {
val tokenAnnotations = Annotation.collect(df, "token")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
val posTagAnnotations = Annotation.collect(df, "pos")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
val sentenceAnnotation = Annotation.collect(df, "sentence")
.flatten
- .sortBy { _.begin }
+ .sortBy { _.start }
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
index 51ea37f3235051..80beba5221be73 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproachTestSpec.scala
@@ -30,7 +30,6 @@ class PerceptronApproachTestSpec extends FlatSpec with PerceptronApproachBehavio
length += text.length + 1
sentence
}
-
new Tokenizer().tag(sentences).toArray
}
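The test changes above exercise the new composite-token behaviour: "New York" is kept as a single token, and each annotation's (start, end) offsets are checked against a slice of the original text. A minimal standalone sketch of the same pipeline follows; it assumes an existing SparkSession named `spark`, and the import paths for DocumentAssembler and Finisher are inferred from the source tree layout.

    // Minimal sketch; assumes `spark: SparkSession` is already in scope.
    import org.apache.spark.ml.Pipeline
    import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher}
    import com.johnsnowlabs.nlp.annotators.Tokenizer

    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")
    val tokenizer = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")
      .setCompositeTokens(Array("New York"))   // kept as a single token
    val finisher = new Finisher()
      .setInputCols("token")
      .setOutputAsArray(true)
      .setOutputCols("output")
    val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))

    import spark.implicits._
    val data = Seq("I won't be from New York.").toDF("text")   // illustrative input
    pipeline.fit(data).transform(data).select("output").show(false)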
From b302ad518980884c24b66146be8f119329fdb1c5 Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 03:13:07 -0300
Subject: [PATCH 5/9] - Fixed Annotation field name
---
src/main/scala/com/johnsnowlabs/nlp/Annotation.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
index f5b7a470762e8b..cf5d925b5081da 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/Annotation.scala
@@ -39,7 +39,7 @@ object Annotation {
/** This is spark type of an annotation representing its metadata shape */
val dataType = new StructType(Array(
StructField("annotatorType", StringType, nullable = true),
- StructField("begin", IntegerType, nullable = false),
+ StructField("start", IntegerType, nullable = false),
StructField("end", IntegerType, nullable = false),
StructField("result", StringType, nullable = true),
StructField("metadata", MapType(StringType, StringType), nullable = true)
From a68f5a68d1ab011e90901dc7c1f2239a1ff2be52 Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 15:51:19 -0300
Subject: [PATCH 6/9] - Features now truly lazy - Removed beforeAnnotate, back
to lazy
---
.../com/johnsnowlabs/nlp/AnnotatorModel.scala | 4 --
.../com/johnsnowlabs/nlp/HasFeatures.scala | 6 +--
.../nlp/ParamsAndFeaturesWritable.scala | 2 +-
.../nlp/annotators/Tokenizer.scala | 14 +------
.../nlp/serialization/Feature.scala | 38 ++++++++++++++++---
.../nlp/annotators/TokenizerTestSpec.scala | 2 +-
.../ner/crf/NerCrfApproachSpec.scala | 8 ++--
7 files changed, 43 insertions(+), 31 deletions(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
index 4fca2ce01d9a52..d3ed36a2daff82 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
@@ -53,9 +53,6 @@ abstract class AnnotatorModel[M <: Model[M]]
StructType(outputFields)
}
- /** override this function if you need to reset or clear annotate variables just once before annotating */
- def beforeAnnotate(): Unit = {}
-
/**
* Given requirements are met, this applies ML transformation within a Pipeline or stand-alone
* Output annotation will be generated as a new column, previous annotations are still available separately
@@ -68,7 +65,6 @@ abstract class AnnotatorModel[M <: Model[M]]
s"${requiredAnnotatorTypes.mkString(", ")}")
val metadataBuilder: MetadataBuilder = new MetadataBuilder()
metadataBuilder.putString("annotatorType", annotatorType)
- beforeAnnotate()
dataset.withColumn(
getOutputCol,
dfAnnotate(
diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala b/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
index d713366ae16eac..ab6c6035046a51 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/HasFeatures.scala
@@ -26,10 +26,10 @@ trait HasFeatures {
protected def get[T](feature: StructFeature[T]): Option[T] = feature.get
- protected def $$[T](feature: ArrayFeature[T]): Array[T] = feature.getValue
+ protected def $$[T](feature: ArrayFeature[T]): Array[T] = feature.getOrDefault
- protected def $$[K, V](feature: MapFeature[K, V]): Map[K, V] = feature.getValue
+ protected def $$[K, V](feature: MapFeature[K, V]): Map[K, V] = feature.getOrDefault
- protected def $$[T](feature: StructFeature[T]): T = feature.getValue
+ protected def $$[T](feature: StructFeature[T]): T = feature.getOrDefault
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala b/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
index aac623b487c02d..9e71fc2884e9dd 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala
@@ -11,7 +11,7 @@ class FeaturesWriter[T](annotatorWithFeatures: HasFeatures, baseWriter: MLWriter
baseWriter.save(path)
for (feature <- annotatorWithFeatures.features) {
- feature.serializeInfer(sparkSession, path, feature.name, feature.getValue)
+ feature.serializeInfer(sparkSession, path, feature.name, feature.getOrDefault)
}
onWritten(path, sparkSession)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index 350a877ad0759a..efcad6d59373a0 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -54,14 +54,10 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
setDefault(inputCols, Array(DOCUMENT))
- lazy private val ruleFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
-
/** Clears out rules and constructs a new rule for every combination of rules provided */
/** The strategy is to catch one token per regex group */
/** User may add its own groups if needs targets to be tokenized separately from the rest */
- protected def setFactoryRules(): Unit = {
- ruleFactory
- .clearRules()
+ lazy private val ruleFactory = {
val rules = ArrayBuffer.empty[String]
require($(infixPatterns).nonEmpty)
require($(infixPatterns).forall(ip => ip.contains("(") && ip.contains(")")),
@@ -81,7 +77,7 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
})
rules.append(rule.toString)
})
- rules.foreach(rule => ruleFactory.addRule(rule.r, rule))
+ rules.foldLeft(new RuleFactory(MatchStrategy.MATCH_FIRST))((factory, rule) => factory.addRule(rule.r, rule))
}
/** Check here for explanation on this default pattern */
@@ -97,12 +93,6 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
setDefault(suffixPattern, "([^\\s\\w]?)([^\\s\\w]*)\\z")
setDefault(targetPattern, "\\S+")
- setFactoryRules()
-
- override def beforeAnnotate(): Unit = {
- setFactoryRules()
- }
-
private val PROTECT_STR = "ↈ"
def tag(sentences: Seq[Sentence]): Seq[TokenizedSentence] = {
diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
index f3a251f533423b..3839970bda238d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
@@ -2,23 +2,29 @@ package com.johnsnowlabs.nlp.serialization
import com.johnsnowlabs.nlp.HasFeatures
import com.johnsnowlabs.nlp.util.ConfigHelper
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import scala.reflect.ClassTag
-abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model: HasFeatures, val name: String)(implicit val sparkSession: SparkSession = SparkSession.builder().getOrCreate()) extends Serializable {
+abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model: HasFeatures, val name: String) extends Serializable {
model.features.append(this)
private val config = ConfigHelper.retrieve
+ private val spark = ResourceHelper.spark
val serializationMode: String = config.getString("performance.serialization")
val useBroadcast: Boolean = config.getBoolean("performance.useBroadcast")
final protected var broadcastValue: Option[Broadcast[TComplete]] = None
+ final protected var fallbackBroadcastValue: Option[Broadcast[TComplete]] = None
+
final protected var rawValue: Option[TComplete] = None
- final protected var fallback: Option[() => TComplete] = None
+ final protected var fallbackRawValue: Option[TComplete] = None
+
+ final protected var fallbackLazyValue: Option[() => TComplete] = None
final def serialize(spark: SparkSession, path: String, field: String, value: TComplete): Unit = {
serializationMode match {
@@ -52,18 +58,38 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
final protected def getFieldPath(path: String, field: String): Path =
Path.mergePaths(new Path(path), new Path("/fields/" + field))
+ private def callAndSetFallback: Option[TComplete] = {
+ if (useBroadcast) {
+ fallbackBroadcastValue = Some(spark.sparkContext.broadcast[TComplete](fallbackLazyValue.get.asInstanceOf[TComplete]))
+ fallbackBroadcastValue.map(_.value)
+ } else {
+ fallbackRawValue = fallbackLazyValue.map(_())
+ fallbackRawValue
+ }
+ }
+
final def get: Option[TComplete] = {
broadcastValue.map(_.value).orElse(rawValue)
}
- final def getValue: TComplete = {
- broadcastValue.map(_.value).orElse(rawValue).orElse(fallback.map(_())).getOrElse(throw new Exception(s"feature $name is not set"))
+ final def getOrDefault: TComplete = {
+ if (useBroadcast) {
+ broadcastValue.map(_.value)
+ .orElse(fallbackBroadcastValue.map(_.value))
+ .orElse(callAndSetFallback)
+ .getOrElse(throw new Exception(s"feature $name is not set"))
+ } else {
+ rawValue
+ .orElse(fallbackRawValue)
+ .orElse(callAndSetFallback)
+ .getOrElse(throw new Exception(s"feature $name is not set"))
+ }
}
final def setValue(v: Option[Any]): HasFeatures = {
if (useBroadcast) {
if (isSet) broadcastValue.get.destroy()
- broadcastValue = Some(sparkSession.sparkContext.broadcast[TComplete](v.get.asInstanceOf[TComplete]))
+ broadcastValue = Some(spark.sparkContext.broadcast[TComplete](v.get.asInstanceOf[TComplete]))
} else {
rawValue = Some(v.get.asInstanceOf[TComplete])
}
@@ -71,7 +97,7 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
}
def setFallback(v: Option[() => TComplete]): HasFeatures = {
- fallback = v
+ fallbackLazyValue = v
model
}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
index 688c420a1f7ad5..001dca7d8eaf99 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -31,7 +31,7 @@ class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
"a Tokenizer" should "correctly tokenize target text on its defaults parameters with exceptions" in {
val data = DataBuilder.basicDataBuild(targetText)
val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
- val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token").setCompositeTokens(Array("New York", "won't"))
+ val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token").setCompositeTokens(Array("New York"))
val finisher = new Finisher().setInputCols("token").setOutputAsArray(true).setCleanAnnotations(false).setOutputCols("output")
val pipeline = new Pipeline().setStages(Array(document, tokenizer, finisher))
val pip = pipeline.fit(data).transform(data)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
index 5ffcd439c6bb51..b72dacc0045642 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala
@@ -16,14 +16,14 @@ class NerCrfApproachSpec extends FlatSpec {
nerModel.write.overwrite.save("./test_crf_pipeline")
val loadedNer = NerCrfModel.read.load("./test_crf_pipeline")
- assert(nerModel.model.getValue.serialize == loadedNer.model.getValue.serialize)
- assert(nerModel.dictionaryFeatures.getValue == loadedNer.dictionaryFeatures.getValue)
+ assert(nerModel.model.getOrDefault.serialize == loadedNer.model.getOrDefault.serialize)
+ assert(nerModel.dictionaryFeatures.getOrDefault == loadedNer.dictionaryFeatures.getOrDefault)
}
"NerCrfApproach" should "have correct set of labels" in {
assert(nerModel.model.isSet)
- val metadata = nerModel.model.getValue.metadata
+ val metadata = nerModel.model.getOrDefault.metadata
assert(metadata.labels.toSeq == Seq("@#Start", "PER", "O", "ORG", "LOC"))
}
@@ -65,7 +65,7 @@ class NerCrfApproachSpec extends FlatSpec {
"NerCrfModel" should "correctly handle entities param" in {
val restrictedModel = new NerCrfModel()
.setEntities(Array("PER", "LOC"))
- .setModel(nerModel.model.getValue)
+ .setModel(nerModel.model.getOrDefault)
.setOutputCol(nerModel.getOutputCol)
.setInputCols(nerModel.getInputCols)
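The intent of this patch is that a Feature's default is no longer recomputed through beforeAnnotate on every transform; an explicitly set value wins, and otherwise the registered fallback is evaluated once and cached. The following is a distilled, standalone illustration of that pattern (not the library's Feature class, which also handles broadcasting and serialization):

    // Illustration only: a set value wins; otherwise the fallback is computed once and cached.
    class LazyFeature[T](name: String) {
      private var rawValue: Option[T] = None
      private var fallbackRawValue: Option[T] = None
      private var fallbackLazyValue: Option[() => T] = None

      def setValue(v: T): this.type = { rawValue = Some(v); this }
      def setFallback(f: () => T): this.type = { fallbackLazyValue = Some(f); this }

      def getOrDefault: T =
        rawValue
          .orElse(fallbackRawValue)
          .orElse { fallbackRawValue = fallbackLazyValue.map(_()); fallbackRawValue }
          .getOrElse(throw new Exception(s"feature $name is not set"))
    }

    // e.g. new LazyFeature[Array[String]]("rules").setFallback(() => Array("\\p{L}+")).getOrDefault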
From a234dd6030480f28fc5717811386afb67fbb319d Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 16:30:30 -0300
Subject: [PATCH 7/9] - Fixed bad lazy call
---
src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
index 3839970bda238d..989318c56bf23c 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
@@ -60,7 +60,7 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
private def callAndSetFallback: Option[TComplete] = {
if (useBroadcast) {
- fallbackBroadcastValue = Some(spark.sparkContext.broadcast[TComplete](fallbackLazyValue.get.asInstanceOf[TComplete]))
+ fallbackBroadcastValue = fallbackLazyValue.map(v => spark.sparkContext.broadcast[TComplete](v()))
fallbackBroadcastValue.map(_.value)
} else {
fallbackRawValue = fallbackLazyValue.map(_())
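The one-line fix replaces an unconditional `.get` on the fallback Option with `.map`, so a Feature with no fallback registered no longer throws NoSuchElementException while building the broadcast. In plain Scala terms:

    val noFallback: Option[() => Int] = None
    // noFallback.get                       // NoSuchElementException at runtime
    val safe = noFallback.map(f => f())     // None, no exception; the lookup simply falls through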
From 562e254ceae92c34173f08cdc788976c31f5d7fc Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 17:16:05 -0300
Subject: [PATCH 8/9] - Fixed NullPointerException. Fallback Default Feature
value may not be broadcast
---
.../nlp/serialization/Feature.scala | 32 ++++++-------------
1 file changed, 10 insertions(+), 22 deletions(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
index 989318c56bf23c..404e3a948bb86d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala
@@ -19,7 +19,6 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
val useBroadcast: Boolean = config.getBoolean("performance.useBroadcast")
final protected var broadcastValue: Option[Broadcast[TComplete]] = None
- final protected var fallbackBroadcastValue: Option[Broadcast[TComplete]] = None
final protected var rawValue: Option[TComplete] = None
final protected var fallbackRawValue: Option[TComplete] = None
@@ -59,13 +58,8 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
Path.mergePaths(new Path(path), new Path("/fields/" + field))
private def callAndSetFallback: Option[TComplete] = {
- if (useBroadcast) {
- fallbackBroadcastValue = fallbackLazyValue.map(v => spark.sparkContext.broadcast[TComplete](v()))
- fallbackBroadcastValue.map(_.value)
- } else {
- fallbackRawValue = fallbackLazyValue.map(_())
- fallbackRawValue
- }
+ fallbackRawValue = fallbackLazyValue.map(_())
+ fallbackRawValue
}
final def get: Option[TComplete] = {
@@ -73,25 +67,19 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag](model:
}
final def getOrDefault: TComplete = {
- if (useBroadcast) {
- broadcastValue.map(_.value)
- .orElse(fallbackBroadcastValue.map(_.value))
- .orElse(callAndSetFallback)
- .getOrElse(throw new Exception(s"feature $name is not set"))
- } else {
- rawValue
- .orElse(fallbackRawValue)
- .orElse(callAndSetFallback)
- .getOrElse(throw new Exception(s"feature $name is not set"))
- }
+ broadcastValue.map(_.value)
+ .orElse(rawValue)
+ .orElse(fallbackRawValue)
+ .orElse(callAndSetFallback)
+ .getOrElse(throw new Exception(s"feature $name is not set"))
}
- final def setValue(v: Option[Any]): HasFeatures = {
+ final def setValue(value: Option[Any]): HasFeatures = {
if (useBroadcast) {
if (isSet) broadcastValue.get.destroy()
- broadcastValue = Some(spark.sparkContext.broadcast[TComplete](v.get.asInstanceOf[TComplete]))
+ broadcastValue = value.map(v => spark.sparkContext.broadcast[TComplete](v.asInstanceOf[TComplete]))
} else {
- rawValue = Some(v.get.asInstanceOf[TComplete])
+ rawValue = Some(value.get.asInstanceOf[TComplete])
}
model
}
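After this patch the lookup order in getOrDefault is fixed: the explicitly set (and possibly broadcast) value, then the raw value, then the cached fallback, then the lazily computed fallback, and fallback defaults are never broadcast. A plain-Option sketch of that precedence, with illustrative names only:

    // Illustrative sketch of the lookup order, not the library code.
    def resolve[T](broadcastValue: Option[T],
                   rawValue: Option[T],
                   cachedFallback: Option[T],
                   lazyFallback: Option[() => T]): T =
      broadcastValue
        .orElse(rawValue)
        .orElse(cachedFallback)
        .orElse(lazyFallback.map(_()))
        .getOrElse(throw new Exception("feature is not set"))

    // resolve(None, None, None, Some(() => 42)) == 42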
From 892f782a37feef7e0b84ff60650252a98ce7b250 Mon Sep 17 00:00:00 2001
From: Saif Addin
Date: Sat, 27 Jan 2018 17:50:02 -0300
Subject: [PATCH 9/9] - language agnostic tokenizer defaults
---
.../nlp/annotators/Tokenizer.scala | 18 +++++++++---------
.../nlp/annotators/TokenizerTestSpec.scala | 4 ++--
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index efcad6d59373a0..4f4e63a65302b0 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -20,8 +20,8 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
val compositeTokens: StringArrayParam = new StringArrayParam(this, "compositeTokens", "Words that won't be split in two")
val targetPattern: Param[String] = new Param(this, "targetPattern", "pattern to grab from text as token candidates. Defaults \\S+")
val infixPatterns: StringArrayParam = new StringArrayParam(this, "infixPattern", "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults")
- val prefixPattern: Param[String] = new Param[String](this, "prefixPattern", "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\w\\$\\.]*)")
- val suffixPattern: Param[String] = new Param[String](this, "suffixPattern", "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\w]?)([^\\s\\w]*)\\z")
+ val prefixPattern: Param[String] = new Param[String](this, "prefixPattern", "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\p{L}$\\.]*)")
+ val suffixPattern: Param[String] = new Param[String](this, "suffixPattern", "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\p{L}]?)([^\\s\\p{L}]*)\\z")
override val annotatorType: AnnotatorType = TOKEN
@@ -82,15 +82,15 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
/** Check here for explanation on this default pattern */
setDefault(infixPatterns, Array(
- "((?:\\w+\\.)+)", // http://rubular.com/r/cRBtGuLlF6
- "(\\w+)(n't\\b)", // http://rubular.com/r/coeYJFt8eM
- "(\\w+)('{1}\\w+)", // http://rubular.com/r/N84PYwYjQp
- "((?:\\w+[^\\s\\w]{1})+\\w+)", // http://rubular.com/r/wOvQcey9e3
- "(\\w+)" // basic word token
+ "((?:\\p{L}+\\.)+)", // http://rubular.com/r/cRBtGuLlF6
+ "(\\p{L}+)(n't\\b)", // http://rubular.com/r/coeYJFt8eM
+ "(\\p{L}+)('{1}\\p{L}+)", // http://rubular.com/r/N84PYwYjQp
+ "((?:\\p{L}+[^\\s\\p{L}]{1})+\\p{L}+)", // http://rubular.com/r/wOvQcey9e3
+ "(\\p{L}+)" // basic word token
))
/** These catch everything before and after a word, as a separate token*/
- setDefault(prefixPattern, "\\A([^\\s\\w\\$\\.]*)")
- setDefault(suffixPattern, "([^\\s\\w]?)([^\\s\\w]*)\\z")
+ setDefault(prefixPattern, "\\A([^\\s\\p{L}$\\.]*)")
+ setDefault(suffixPattern, "([^\\s\\p{L}]?)([^\\s\\p{L}]*)\\z")
setDefault(targetPattern, "\\S+")
private val PROTECT_STR = "ↈ"
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
index 001dca7d8eaf99..481abd7c9baa8a 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/TokenizerTestSpec.scala
@@ -22,9 +22,9 @@ class TokenizerTestSpec extends FlatSpec with TokenizerBehaviors {
}
- val targetText = "Hello, I won't be from New York in the U.S.A. (and you know it). Give me my horse! or $100 bucks 'He said', I'll defeat markus-crassus."
+ val targetText = "Hello, I won't be from New York in the U.S.A. (and you know it héroe). Give me my horse! or $100 bucks 'He said', I'll defeat markus-crassus."
val expected = Array(
- "Hello", ",", "I", "wo", "n't", "be", "from", "New York", "in", "the", "U.S.A.", "(", "and", "you", "know", "it", ")", ".",
+ "Hello", ",", "I", "wo", "n't", "be", "from", "New York", "in", "the", "U.S.A.", "(", "and", "you", "know", "it", "héroe", ")", ".",
"Give", "me", "my", "horse", "!", "or", "$100", "bucks", "'", "He", "said", "'", ",", "I", "'ll", "defeat", "markus-crassus", "."
)
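The switch from \w to \p{L} is what makes the defaults language agnostic: in Java/Scala regexes \w matches ASCII word characters only, so an accented token such as "héroe" gets split, while \p{L} matches any Unicode letter. A quick check:

    val asciiWord   = "\\w+".r
    val unicodeWord = "\\p{L}+".r

    println(asciiWord.findAllIn("héroe").toList)    // List(h, roe) -- the accent breaks \w
    println(unicodeWord.findAllIn("héroe").toList)  // List(héroe)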