Dataset Path in Ner Approach #39

Merged: 5 commits, Oct 30, 2017

5 changes: 4 additions & 1 deletion build.sbt
@@ -24,6 +24,8 @@ spIncludeMaven := false

spAppendScalaVersion := false

resolvers += "Maven Central" at "http://central.maven.org/maven2/"

assemblyOption in assembly := (assemblyOption in assembly).value.copy(
includeScala = false
)
@@ -55,7 +57,8 @@ lazy val testDependencies = Seq(
)

lazy val utilDependencies = Seq(
"com.typesafe" % "config" % "1.3.0"
"com.typesafe" % "config" % "1.3.0",
"org.fusesource.leveldbjni" % "leveldbjni-all" % "1.8"
)

lazy val root = (project in file("."))
64 changes: 45 additions & 19 deletions python/example/crf-ner/ner.ipynb
@@ -3,7 +3,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
@@ -20,7 +22,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"spark = SparkSession.builder \\\n",
@@ -44,23 +48,27 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark.sql.types import *\n",
"\n",
"class Annotation:\n",
" def __init__(self, annotatorType, begin, end, metadata):\n",
" def __init__(self, annotatorType, begin, end, result, metadata):\n",
" self.annotatorType = annotatorType\n",
" self.begin = begin\n",
" self.end = end\n",
" self.result = result\n",
" self.metadata = metadata\n",
"\n",
" \n",
"annotation_schema = StructType([\n",
" StructField(\"annotatorType\", StringType()),\n",
" StructField(\"begin\", IntegerType(), False),\n",
" StructField(\"end\", IntegerType(), False),\n",
" StructField(\"result\", StringType()),\n",
" StructField(\"metadata\", MapType(StringType(), StringType()))\n",
"])\n",
" \n",
@@ -91,7 +99,7 @@
" doc = doc + word\n",
" end = len(doc) - 1\n",
" ner = items[3]\n",
" labels.append(Annotation(\"named_entity\", begin, end, {\"tag\": ner}))\n",
" labels.append(Annotation(\"named_entity\", begin, end, ner, {}))\n",
"\n",
" if doc:\n",
" result.append((doc, labels))\n",
@@ -110,7 +118,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import time\n",
@@ -129,17 +139,17 @@
" .setOutputCol(\"token\")\n",
"\n",
" posTagger = PerceptronApproach()\\\n",
" .setCorpusPath(\"../../../src/test/resources/anc-pos-corpus/\")\\\n",
" .setCorpusPath(\"../../../src/main/resources/anc-pos-corpus/\")\\\n",
" .setIterations(5)\\\n",
" .setInputCols([\"token\", \"document\"])\\\n",
" .setOutputCol(\"pos\")\n",
"\n",
" nerTagger = CrfBasedNer()\\\n",
" nerTagger = NerCrfApproach()\\\n",
" .setInputCols([\"sentence\", \"token\", \"pos\"])\\\n",
" .setLabelColumn(\"label\")\\\n",
" .setOutputCol(\"ner\")\\\n",
" .setMinEpochs(1)\\\n",
" .setMaxEpochs(300)\\\n",
" .setMaxEpochs(10)\\\n",
" .setLossEps(1e-3)\\\n",
" .setDicts([\"../../../src/main/resources/ner-corpus/dict.txt\"])\\\n",
" .setL2(1)\\\n",
@@ -177,7 +187,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.sql.functions import col, udf, explode\n",
@@ -235,12 +247,12 @@
" label = line[0]\n",
" ner = line[1]\n",
" \n",
" ner = {(a[\"begin\"], a[\"end\"]):a[\"metadata\"][\"tag\"] for a in ner}\n",
" ner = {(a[\"begin\"], a[\"end\"]):a[\"result\"] for a in ner}\n",
"\n",
" for a in label:\n",
" key = (a[\"begin\"], a[\"end\"])\n",
"\n",
" label = a[\"metadata\"][\"tag\"].strip()\n",
" label = a[\"result\"].strip()\n",
" predictedLabel = ner.get(key, \"O\").strip()\n",
" \n",
" if key not in ner and ignore_tokenize_misses:\n",
@@ -284,7 +296,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os.path\n",
@@ -298,7 +312,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"model = train_model(train_file)"
@@ -307,7 +324,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(\"\\nQuality on training data\")\n",
@@ -323,7 +342,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = get_dataset_for_analysis(test_file_a, model, spark)\n",
@@ -333,7 +354,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"get_pipeline().write().overwrite().save(\"./crf_pipeline\")\n",
@@ -344,6 +367,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
@@ -357,7 +381,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"print(\"\\nQuality on training data\")\n",
@@ -374,7 +400,7 @@
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
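Worth noting for readers of the notebook diff above: the Annotation schema gains a dedicated result field, and the NER tag moves there from metadata["tag"]. A minimal sketch of what this means when reading a collected annotation row; the field names come from the annotation_schema in the diff, while the sample values are made up:

```python
# Hypothetical row shaped like the annotation_schema defined in the notebook above.
row = {
    "annotatorType": "named_entity",
    "begin": 0,
    "end": 4,
    "result": "B-PER",  # the tag now lives in `result`
    "metadata": {}      # previously carried {"tag": "B-PER"}
}

tag = row["result"]             # new access pattern used by the evaluation code above
# tag = row["metadata"]["tag"]  # old access pattern, dropped by this change
```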
17 changes: 11 additions & 6 deletions python/sparknlp/annotator.py
@@ -458,7 +458,7 @@ class NorvigSweetingModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorPr



class CrfBasedNer(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
class NerCrfApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
labelColumn = Param(Params._dummy(),
"labelColumn",
"Column with label per each token",
@@ -477,6 +477,7 @@ class CrfBasedNer(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProper
randomSeed = Param(Params._dummy(), "randomSeed", "Random seed", TypeConverters.toInt)

dicts = Param(Params._dummy(), "dicts", "Additional dictionaries paths to use as a features", TypeConverters.toListString)
datasetPath = Param(Params._dummy(), "datasetPath", "Path to dataset. If path is empty will use dataset passed to train as usual Spark Pipeline stage", TypeConverters.toString)

def setLabelColumn(self, value):
self._set(labelColumn=value)
@@ -522,13 +523,17 @@ def setDicts(self, dictionaries):
self._set(dicts = dictionaries)
return self

def setDatasetPath(self, path):
self._set(datasetPath = path)
return self

def _create_model(self, java_model):
return CrfBasedNerModel(java_model)
return NerCrfModel(java_model)

@keyword_only
def __init__(self):
super(CrfBasedNer, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.ner.crf.CrfBasedNer", self.uid)
super(NerCrfApproach, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach", self.uid)

self._setDefault(
minEpochs = 0,
Expand All @@ -540,5 +545,5 @@ def __init__(self):
)


class CrfBasedNerModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
name = "CrfBasedNerModel"
class NerCrfModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
name = "NerCrfModel"
6 changes: 3 additions & 3 deletions src/main/scala/com/johnsnowlabs/ml/crf/DatasetEncoder.scala
@@ -71,7 +71,7 @@ class DatasetEncoder(val startLabel: String = "@#Start") {
def getFeatures(prevLabel: String = startLabel,
label: String,
binaryAttrs: Seq[String],
numAttrs: Seq[(String, Float)]): (Int, SparseArray) = {
numAttrs: Seq[Float]): (Int, SparseArray) = {
val labelId = getLabel(label)

val binFeature = binaryAttrs.map{attr =>
@@ -80,8 +80,8 @@
(attrId, 1f)
}

val numFeatures = numAttrs.map{case(attr, value) => {
val attrId = getAttr(attr, true)
val numFeatures = numAttrs.zipWithIndex.map{case(value, idx) => {
val attrId = getAttr("num" + idx, true)
addAttrFeature(labelId, attrId, value)
(attrId, value)
}}
29 changes: 20 additions & 9 deletions src/main/scala/com/johnsnowlabs/ml/crf/DatasetReader.scala
@@ -1,14 +1,17 @@
package com.johnsnowlabs.ml.crf

import java.io.FileInputStream

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream

import scala.collection.TraversableOnce
import scala.collection.mutable.ArrayBuffer
import scala.io.Source


case class TextSentenceLabels(labels: Seq[String])
case class TextSentenceAttrs(words: Seq[WordAttrs])
case class WordAttrs(attrs: Seq[(String, String)])
case class WordAttrs(strAttrs: Seq[(String, String)], numAttrs: Array[Float] = Array.empty)


object DatasetReader {
@@ -23,7 +26,7 @@ object DatasetReader {
}
}

private def readWithLabels(file: String, skipLines: Int = 0): Iterator[(TextSentenceLabels, TextSentenceAttrs)] = {
private def readWithLabels(file: String, skipLines: Int = 0): TraversableOnce[(TextSentenceLabels, TextSentenceAttrs)] = {
val lines = getSource(file)
.getLines()
.drop(skipLines)
@@ -66,22 +69,22 @@
}
}

def encodeDataset(source: Iterator[(TextSentenceLabels, TextSentenceAttrs)]): CrfDataset = {
def encodeDataset(source: TraversableOnce[(TextSentenceLabels, TextSentenceAttrs)]): CrfDataset = {
val metadata = new DatasetEncoder()

val instances = source.map{case (textLabels, textSentence) =>
var prevLabel = metadata.startLabel
val (labels, features) = textLabels.labels.zip(textSentence.words)
.map{case (label, word) =>
val attrs = word.attrs.map(a => a._1 + "=" + a._2)
val (labelId, features) = metadata.getFeatures(prevLabel, label, attrs, Seq.empty)
val attrs = word.strAttrs.map(a => a._1 + "=" + a._2)
val (labelId, features) = metadata.getFeatures(prevLabel, label, attrs, word.numAttrs)
prevLabel = label

(labelId, features)
}.unzip

(InstanceLabels(labels), Instance(features))
}.toList
}.toArray

CrfDataset(instances, metadata.getMetadata)
}
@@ -93,12 +96,20 @@

def encodeSentence(sentence: TextSentenceAttrs, metadata: DatasetMetadata): Instance = {
val items = sentence.words.map{word =>
val attrIds = word.attrs.flatMap { case (name, value) =>
val strAttrs = word.strAttrs.flatMap { case (name, value) =>
val key = name + "=" + value
metadata.attr2Id.get(key)
}.map((_, 1f))

val numAttrs = word.numAttrs.zipWithIndex.flatMap {case(value, idx) =>
val key = "num" + idx
val attr = metadata.attr2Id.get(key)
attr.map(attrName => (attrName, value))
}

val attrValues = attrIds.sortBy(id => id).distinct.map(id => (id, 1f)).toArray
val id2value = strAttrs ++ numAttrs

val attrValues = id2value.sortBy(id => id._1).distinct.toArray
new SparseArray(attrValues)
}

@@ -111,7 +122,7 @@
encodeDataset(textDataset)
}

def readAndEncode(file: String, skipLines: Int, metadata: DatasetMetadata): Iterator[(InstanceLabels, Instance)] = {
def readAndEncode(file: String, skipLines: Int, metadata: DatasetMetadata): TraversableOnce[(InstanceLabels, Instance)] = {
val textDataset = readWithLabels(file, skipLines)

textDataset.map{case (sourceLabels, sourceInstance) =>