Dataset Path in Ner Approach #39

Merged: 5 commits, Oct 30, 2017

5 changes: 4 additions & 1 deletion build.sbt
@@ -24,6 +24,8 @@ spIncludeMaven := false

spAppendScalaVersion := false

resolvers += "Maven Central" at "http://central.maven.org/maven2/"

assemblyOption in assembly := (assemblyOption in assembly).value.copy(
includeScala = false
)
@@ -55,7 +57,8 @@ lazy val testDependencies = Seq(
)

lazy val utilDependencies = Seq(
"com.typesafe" % "config" % "1.3.0"
"com.typesafe" % "config" % "1.3.0",
"org.fusesource.leveldbjni" % "leveldbjni-all" % "1.8"
)

lazy val root = (project in file("."))
64 changes: 45 additions & 19 deletions python/example/crf-ner/ner.ipynb
@@ -3,7 +3,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
@@ -20,7 +22,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"spark = SparkSession.builder \\\n",
@@ -44,23 +48,27 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark.sql.types import *\n",
"\n",
"class Annotation:\n",
" def __init__(self, annotatorType, begin, end, metadata):\n",
" def __init__(self, annotatorType, begin, end, result, metadata):\n",
" self.annotatorType = annotatorType\n",
" self.begin = begin\n",
" self.end = end\n",
" self.result = result\n",
" self.metadata = metadata\n",
"\n",
" \n",
"annotation_schema = StructType([\n",
" StructField(\"annotatorType\", StringType()),\n",
" StructField(\"begin\", IntegerType(), False),\n",
" StructField(\"end\", IntegerType(), False),\n",
" StructField(\"result\", StringType()),\n",
" StructField(\"metadata\", MapType(StringType(), StringType()))\n",
"])\n",
" \n",
@@ -91,7 +99,7 @@
" doc = doc + word\n",
" end = len(doc) - 1\n",
" ner = items[3]\n",
" labels.append(Annotation(\"named_entity\", begin, end, {\"tag\": ner}))\n",
" labels.append(Annotation(\"named_entity\", begin, end, ner, {}))\n",
"\n",
" if doc:\n",
" result.append((doc, labels))\n",
@@ -110,7 +118,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import time\n",
@@ -129,17 +139,17 @@
" .setOutputCol(\"token\")\n",
"\n",
" posTagger = PerceptronApproach()\\\n",
" .setCorpusPath(\"../../../src/test/resources/anc-pos-corpus/\")\\\n",
" .setCorpusPath(\"../../../src/main/resources/anc-pos-corpus/\")\\\n",
" .setIterations(5)\\\n",
" .setInputCols([\"token\", \"document\"])\\\n",
" .setOutputCol(\"pos\")\n",
"\n",
" nerTagger = CrfBasedNer()\\\n",
" nerTagger = NerCrfApproach()\\\n",
" .setInputCols([\"sentence\", \"token\", \"pos\"])\\\n",
" .setLabelColumn(\"label\")\\\n",
" .setOutputCol(\"ner\")\\\n",
" .setMinEpochs(1)\\\n",
" .setMaxEpochs(300)\\\n",
" .setMaxEpochs(10)\\\n",
" .setLossEps(1e-3)\\\n",
" .setDicts([\"../../../src/main/resources/ner-corpus/dict.txt\"])\\\n",
" .setL2(1)\\\n",
@@ -177,7 +187,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.sql.functions import col, udf, explode\n",
@@ -235,12 +247,12 @@
" label = line[0]\n",
" ner = line[1]\n",
" \n",
" ner = {(a[\"begin\"], a[\"end\"]):a[\"metadata\"][\"tag\"] for a in ner}\n",
" ner = {(a[\"begin\"], a[\"end\"]):a[\"result\"] for a in ner}\n",
"\n",
" for a in label:\n",
" key = (a[\"begin\"], a[\"end\"])\n",
"\n",
" label = a[\"metadata\"][\"tag\"].strip()\n",
" label = a[\"result\"].strip()\n",
" predictedLabel = ner.get(key, \"O\").strip()\n",
" \n",
" if key not in ner and ignore_tokenize_misses:\n",
@@ -284,7 +296,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os.path\n",
@@ -298,7 +312,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"model = train_model(train_file)"
@@ -307,7 +324,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(\"\\nQuality on training data\")\n",
@@ -323,7 +342,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = get_dataset_for_analysis(test_file_a, model, spark)\n",
@@ -333,7 +354,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"get_pipeline().write().overwrite().save(\"./crf_pipeline\")\n",
@@ -344,6 +367,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
@@ -357,7 +381,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"print(\"\\nQuality on training data\")\n",
@@ -374,7 +400,7 @@
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
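Worth noting for readers of the notebook diff above: the Annotation schema gains a dedicated result field, and the NER tag moves there from metadata["tag"]. A minimal sketch of what this means when reading a collected annotation row; the field names come from the annotation_schema in the diff, while the sample values are made up:

```python
# Hypothetical row shaped like the annotation_schema defined in the notebook above.
row = {
    "annotatorType": "named_entity",
    "begin": 0,
    "end": 4,
    "result": "B-PER",  # the tag now lives in `result`
    "metadata": {}      # previously carried {"tag": "B-PER"}
}

tag = row["result"]             # new access pattern used by the evaluation code above
# tag = row["metadata"]["tag"]  # old access pattern, dropped by this change
```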
17 changes: 11 additions & 6 deletions python/sparknlp/annotator.py
@@ -458,7 +458,7 @@ class NorvigSweetingModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorPr



class CrfBasedNer(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
class NerCrfApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
labelColumn = Param(Params._dummy(),
"labelColumn",
"Column with label per each token",
@@ -477,6 +477,7 @@ class CrfBasedNer(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProper
randomSeed = Param(Params._dummy(), "randomSeed", "Random seed", TypeConverters.toInt)

dicts = Param(Params._dummy(), "dicts", "Additional dictionaries paths to use as a features", TypeConverters.toListString)
datasetPath = Param(Params._dummy(), "datasetPath", "Path to dataset. If path is empty will use dataset passed to train as usual Spark Pipeline stage", TypeConverters.toString)

def setLabelColumn(self, value):
self._set(labelColumn=value)
@@ -522,13 +523,17 @@ def setDicts(self, dictionaries):
self._set(dicts = dictionaries)
return self

def setDatasetPath(self, path):
self._set(datasetPath = path)
return self

def _create_model(self, java_model):
return CrfBasedNerModel(java_model)
return NerCrfModel(java_model)

@keyword_only
def __init__(self):
super(CrfBasedNer, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.ner.crf.CrfBasedNer", self.uid)
super(NerCrfApproach, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach", self.uid)

self._setDefault(
minEpochs = 0,
Expand All @@ -540,5 +545,5 @@ def __init__(self):
)


class CrfBasedNerModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
name = "CrfBasedNerModel"
class NerCrfModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorProperties):
name = "NerCrfModel"
6 changes: 3 additions & 3 deletions src/main/scala/com/johnsnowlabs/ml/crf/DatasetEncoder.scala
@@ -71,7 +71,7 @@ class DatasetEncoder(val startLabel: String = "@#Start") {
def getFeatures(prevLabel: String = startLabel,
label: String,
binaryAttrs: Seq[String],
numAttrs: Seq[(String, Float)]): (Int, SparseArray) = {
numAttrs: Seq[Float]): (Int, SparseArray) = {
val labelId = getLabel(label)

val binFeature = binaryAttrs.map{attr =>
@@ -80,8 +80,8 @@
(attrId, 1f)
}

val numFeatures = numAttrs.map{case(attr, value) => {
val attrId = getAttr(attr, true)
val numFeatures = numAttrs.zipWithIndex.map{case(value, idx) => {
val attrId = getAttr("num" + idx, true)
addAttrFeature(labelId, attrId, value)
(attrId, value)
}}
29 changes: 20 additions & 9 deletions src/main/scala/com/johnsnowlabs/ml/crf/DatasetReader.scala
@@ -1,14 +1,17 @@
package com.johnsnowlabs.ml.crf

import java.io.FileInputStream

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream

import scala.collection.TraversableOnce
import scala.collection.mutable.ArrayBuffer
import scala.io.Source


case class TextSentenceLabels(labels: Seq[String])
case class TextSentenceAttrs(words: Seq[WordAttrs])
case class WordAttrs(attrs: Seq[(String, String)])
case class WordAttrs(strAttrs: Seq[(String, String)], numAttrs: Array[Float] = Array.empty)


object DatasetReader {
@@ -23,7 +26,7 @@ object DatasetReader {
}
}

private def readWithLabels(file: String, skipLines: Int = 0): Iterator[(TextSentenceLabels, TextSentenceAttrs)] = {
private def readWithLabels(file: String, skipLines: Int = 0): TraversableOnce[(TextSentenceLabels, TextSentenceAttrs)] = {
val lines = getSource(file)
.getLines()
.drop(skipLines)
@@ -66,22 +69,22 @@
}
}

def encodeDataset(source: Iterator[(TextSentenceLabels, TextSentenceAttrs)]): CrfDataset = {
def encodeDataset(source: TraversableOnce[(TextSentenceLabels, TextSentenceAttrs)]): CrfDataset = {
val metadata = new DatasetEncoder()

val instances = source.map{case (textLabels, textSentence) =>
var prevLabel = metadata.startLabel
val (labels, features) = textLabels.labels.zip(textSentence.words)
.map{case (label, word) =>
val attrs = word.attrs.map(a => a._1 + "=" + a._2)
val (labelId, features) = metadata.getFeatures(prevLabel, label, attrs, Seq.empty)
val attrs = word.strAttrs.map(a => a._1 + "=" + a._2)
val (labelId, features) = metadata.getFeatures(prevLabel, label, attrs, word.numAttrs)
prevLabel = label

(labelId, features)
}.unzip

(InstanceLabels(labels), Instance(features))
}.toList
}.toArray

CrfDataset(instances, metadata.getMetadata)
}
@@ -93,12 +96,20 @@

def encodeSentence(sentence: TextSentenceAttrs, metadata: DatasetMetadata): Instance = {
val items = sentence.words.map{word =>
val attrIds = word.attrs.flatMap { case (name, value) =>
val strAttrs = word.strAttrs.flatMap { case (name, value) =>
val key = name + "=" + value
metadata.attr2Id.get(key)
}.map((_, 1f))

val numAttrs = word.numAttrs.zipWithIndex.flatMap {case(value, idx) =>
val key = "num" + idx
val attr = metadata.attr2Id.get(key)
attr.map(attrName => (attrName, value))
}

val attrValues = attrIds.sortBy(id => id).distinct.map(id => (id, 1f)).toArray
val id2value = strAttrs ++ numAttrs

val attrValues = id2value.sortBy(id => id._1).distinct.toArray
new SparseArray(attrValues)
}

@@ -111,7 +122,7 @@
encodeDataset(textDataset)
}

def readAndEncode(file: String, skipLines: Int, metadata: DatasetMetadata): Iterator[(InstanceLabels, Instance)] = {
def readAndEncode(file: String, skipLines: Int, metadata: DatasetMetadata): TraversableOnce[(InstanceLabels, Instance)] = {
val textDataset = readWithLabels(file, skipLines)

textDataset.map{case (sourceLabels, sourceInstance) =>