From 6b72c4e3ee24e46d09bb64dad5b5109cefe42cbc Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 24 Nov 2017 11:05:47 -0300 Subject: [PATCH 01/55] added unit test for windowing - work in progress --- .../com/johnsnowlabs/nlp/AnnotatorType.scala | 1 + .../logreg/AssertionLogRegApproach.scala | 57 ++++++++++++++++++ .../logreg/AssertionLogRegModel.scala | 27 +++++++++ .../assertion/logreg/Windowing.scala | 41 +++++++++++++ .../assertion/AssertionStatusTest.scala | 14 +++++ .../assertion/SentenceWindowingTest.scala | 60 +++++++++++++++++++ 6 files changed, 200 insertions(+) create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/AssertionStatusTest.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorType.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorType.scala index cda482e20a58e4..68490a053016ac 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorType.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorType.scala @@ -12,5 +12,6 @@ object AnnotatorType { val NAMED_ENTITY = "named_entity" val NEGEX = "negex" val DEPENDENCY = "dependency" + val ASSERTION = "assertion" val DUMMY = "dummy" } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala new file mode 100644 index 00000000000000..c7d79f42c32a2e --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -0,0 +1,57 @@ +package com.johnsnowlabs.nlp.annotators.assertion.logreg + +import com.johnsnowlabs.nlp.AnnotatorType._ +import com.johnsnowlabs.nlp.embeddings.AnnotatorWithWordEmbeddings +import com.johnsnowlabs.nlp.{AnnotatorApproach} +import org.apache.spark.sql.functions.udf +import org.apache.spark.ml.param.Param +import org.apache.spark.sql.Dataset +import org.apache.spark.ml.classification.LogisticRegression + +/** + * Created by jose on 22/11/17. 
+ */ +class AssertionLogRegApproach extends AnnotatorApproach[AssertionLogRegModel] + with AnnotatorWithWordEmbeddings with Windowing { + + override val requiredAnnotatorTypes = Array(TOKEN) + override val description: String = "Clinical Text Status Assertion" + override val annotatorType: AnnotatorType = null + override val uid: String = "" + + // example of possible values, 'Negated', 'Affirmed', 'Historical' + val labelColumn = new Param[String](this, "labelColumn", "Column with one label per document") + + // the document where we're extracting the assertion + val documentColumn = new Param[String](this, "documentColumn", "Column with one label per document") + + // the target term, that must appear capitalized in the document, e.g., 'diabetes' + val targetColumn = new Param[String](this, "targetColumn", "Column with the target to analyze") + + override val (before, after) = (10, 5) + + override def train(dataset: Dataset[_]): AssertionLogRegModel = { + import dataset.sqlContext.implicits._ + val processed = dataset.toDF.select(applyWindowUdf($"documentColumn", $"targetColumn") + .as("features"), $"labelColumn".as("label")) + + val lr = new LogisticRegression() + .setMaxIter(10) + .setRegParam(0.3) + .setElasticNetParam(0.8) + lr.fit(processed) + + AssertionLogRegModel(lr) + } + + private def applyWindowUdf = + udf {(doc:String, target:String) => applyWindow(doc, target)} + + + private def getFeatureVector(doc:String, target:String) = { + val datapoint = applyWindow(doc, target) + datapoint.flatMap(embeddings.get.getEmbeddings) + } + + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala new file mode 100644 index 00000000000000..bf861818ccdbc7 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -0,0 +1,27 @@ +package com.johnsnowlabs.nlp.annotators.assertion.logreg + +import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType} +import com.johnsnowlabs.nlp.embeddings.ModelWithWordEmbeddings +import org.apache.spark.ml.classification.LogisticRegression + +/** + * Created by jose on 22/11/17. + */ +class AssertionLogRegModel(model:LogisticRegression) extends AnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings{ + + /** + * takes a document and annotations and produces new annotations of this annotator's annotation type + * + * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return any number of annotations processed for every input annotation. Not necessary one to one relationship + */ + override protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] = ??? 
+ + override val annotatorType: AnnotatorType = AnnotatorType.ASSERTION + override val requiredAnnotatorTypes: Array[String] = null + override val uid: String = "" +} + +object AssertionLogRegModel { + def apply(model: LogisticRegression): AssertionLogRegModel = new AssertionLogRegModel(model) +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala new file mode 100644 index 00000000000000..5d946ed15112e3 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -0,0 +1,41 @@ +package com.johnsnowlabs.nlp.annotators.assertion.logreg + +/** + * Created by jose on 24/11/17. + */ +trait Windowing { + + val before : Int + val after : Int + + /* apply window, pad/truncate sentence according to window */ + protected def applyWindow(doc: String, target:String) = { + val sentSplits = doc.split(target).map(_.trim) + val targetPart = target.split(" ") + + val leftPart = if (sentSplits.head.isEmpty) Array[String]() + else sentSplits.head.split(" ") + + val rightPart = if (sentSplits.length == 1) Array[String]() + else sentSplits.last.split(" ") + + val (start, leftPadding) = + if(leftPart.size >= before) + (leftPart.size - before, Array[String]()) + else + (0, Array.fill(before - leftPart.length)("empty_marker")) + + val (end, rightPadding) = + if(targetPart.length - 1 + rightPart.length <= after) + (rightPart.length, Array.fill(after - (targetPart.length - 1 + rightPart.length))("empty_marker")) + else + (after - targetPart.length, Array[String]()) + + val leftContext = leftPart.slice(start, leftPart.length) + val rightContext = rightPart.slice(0, end + 1) + + leftPadding ++ leftContext ++ targetPart ++ rightContext ++ rightPadding + + } + +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/AssertionStatusTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/AssertionStatusTest.scala new file mode 100644 index 00000000000000..aeac41b14163ce --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/AssertionStatusTest.scala @@ -0,0 +1,14 @@ +package com.johnsnowlabs.nlp.annotators.assertion + +import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach + +/** + * Created by jose on 22/11/17. + */ +class AssertionStatusTest extends App { + + object assertion extends AssertionLogRegApproach { + + } + +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala new file mode 100644 index 00000000000000..9b26e563e6ccbe --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala @@ -0,0 +1,60 @@ +package com.johnsnowlabs.nlp.annotators.assertion + +import com.johnsnowlabs.nlp.AnnotatorType +import com.johnsnowlabs.nlp.annotators.assertion.logreg.{AssertionLogRegApproach, Windowing} +import org.scalatest.{FlatSpec, Matchers} + +/** + * Created by jose on 24/11/17. 
+ */
+class SentenceWindowingTest extends FlatSpec with Matchers {
+
+  trait Scope extends Windowing {
+    override val before: Int = 5
+    override val after: Int = 5
+  }
+
+  "sentences" should "be correctly padded" in new Scope {
+    val doc = "the cat eats fish"
+    val target = "cat"
+    val result = applyWindow(doc, target)
+    val expected = Array("empty_marker", "empty_marker", "empty_marker", "empty_marker",
+      "the", "cat", "eats", "fish", "empty_marker", "empty_marker", "empty_marker")
+
+    assert(expected === result)
+  }
+
+  "sentences" should "be correctly truncated" in new Scope {
+    val doc = "it has been said that the cat eats fish while listens to the rain"
+    val target = "cat"
+    val expected = "has been said that the cat eats fish while listens to".split(" ")
+    val result = applyWindow(doc, target)
+    assert(expected === result)
+  }
+
+  "multi word targets" should "be correctly identified" in new Scope{
+    val doc = "it has been said that the cat eats fish while listens to the rain"
+    val target = "the cat"
+    val expected = "it has been said that the cat eats fish while listens".split(" ")
+    val result = applyWindow(doc, target)
+    assert(expected === result)
+  }
+
+  "targets in the border" should "be correctly identified - left" in new Scope {
+    val doc = "the cat eats fish while listens to the rain"
+    val target = "the cat"
+    val expected = ("empty_marker empty_marker empty_marker empty_marker empty_marker " +
+      "the cat eats fish while listens").split(" ")
+    val result = applyWindow(doc, target)
+    assert(expected === result)
+  }
+
+  "targets in the border" should "be correctly identified - right" in new Scope {
+    val doc = "it has been said that the cat"
+    val target = "the cat"
+    val expected = "it has been said that the cat empty_marker empty_marker empty_marker empty_marker ".split(" ")
+    val result = applyWindow(doc, target)
+    assert(expected === result)
+  }
+
+}
From ddd5a32be716cb1b0d28cdb3addd674d6a2828e3 Mon Sep 17 00:00:00 2001
From: Alberto
Date: Wed, 29 Nov 2017 13:26:29 -0300
Subject: [PATCH 02/55] added script for converting negex dataset to CSV

---
 .../johnsnowlabs/ml/logreg/convert_negex.py   | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/convert_negex.py

diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/convert_negex.py b/src/test/scala/com/johnsnowlabs/ml/logreg/convert_negex.py
new file mode 100644
index 00000000000000..4ef5e3afca9ac2
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/convert_negex.py
@@ -0,0 +1,25 @@
+'''
+
+Simple script to convert the negex files to CSV.
+Merge all annotations to form a single classification problem.
+
+'''
+
+import sys, re
+
+if len(sys.argv) < 2:
+    print('Which file?')
+ exit(1) + +delimiter = ',' + +with open(sys.argv[1]) as input, open(sys.argv[1] + '.csv', 'w') as output: + total = 0 + output.write('sentence\ttarget\tlabel\n') + for line in input: + total += 1 + chunks = line.split('\t') + if len(chunks[2].split(' ')) > 4: + continue + print(chunks) + output.write(chunks[2] + '\t' + chunks[1] + '\t' + chunks[3] + '\n') From e7f2b96c063df442fa5963dd1d178400d75cdb39 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 29 Nov 2017 17:36:52 -0300 Subject: [PATCH 03/55] work in progress - fixed problem with embeddings --- .../logreg/AssertionLogRegApproach.scala | 16 ++---- .../assertion/logreg/Windowing.scala | 36 +++++++++++-- .../nlp/datasets/CoNLL2003NerReader.scala | 1 - .../embeddings/WordEmbeddingsIndexer.scala | 6 ++- .../ml/logreg/NegexDatasetLogRegTest.scala | 44 ++++++++++++++++ .../ml/logreg/NegexDatasetReader.scala | 52 +++++++++++++++++++ .../assertion/SentenceWindowingTest.scala | 5 +- 7 files changed, 139 insertions(+), 21 deletions(-) create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index c7d79f42c32a2e..19c953cc2a2c3c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -32,9 +32,12 @@ class AssertionLogRegApproach extends AnnotatorApproach[AssertionLogRegModel] override def train(dataset: Dataset[_]): AssertionLogRegModel = { import dataset.sqlContext.implicits._ + + /* apply UDF to fix the length of each document */ val processed = dataset.toDF.select(applyWindowUdf($"documentColumn", $"targetColumn") - .as("features"), $"labelColumn".as("label")) + .as("window"), $"labelColumn".as("label")) + /* TODO: pick the parameters you want to expose*/ val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.3) @@ -44,14 +47,5 @@ class AssertionLogRegApproach extends AnnotatorApproach[AssertionLogRegModel] AssertionLogRegModel(lr) } - private def applyWindowUdf = - udf {(doc:String, target:String) => applyWindow(doc, target)} - - - private def getFeatureVector(doc:String, target:String) = { - val datapoint = applyWindow(doc, target) - datapoint.flatMap(embeddings.get.getEmbeddings) - } - - + override val embeddingsPath: String = ??? } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 5d946ed15112e3..ce3977de1cb46b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -1,15 +1,24 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg +import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings} +import org.apache.spark.sql.functions._ +import org.apache.spark.ml.linalg.{Vector, Vectors} + /** * Created by jose on 24/11/17. 
*/ trait Windowing { - val before : Int - val after : Int + val before : Int = 6 + val after : Int = 18 + val embeddingsPath: String = "override me" + + // hard-coded stuff + lazy val wordVectors: Option[WordEmbeddings] = None //= Some(WordEmbeddings(embeddingsPath, 200)) + /* apply window, pad/truncate sentence according to window */ - protected def applyWindow(doc: String, target:String) = { + def applyWindow(doc: String, target:String) : Array[String] = { val sentSplits = doc.split(target).map(_.trim) val targetPart = target.split(" ") @@ -35,7 +44,26 @@ trait Windowing { val rightContext = rightPart.slice(0, end + 1) leftPadding ++ leftContext ++ targetPart ++ rightContext ++ rightPadding - } + /* same as above, but convert the resulting text in a vector */ + def applyWindowUdf = + udf {(doc:String, target:String) => + val tmp : Array[Double] = applyWindow(doc.toLowerCase, target.toLowerCase).flatMap(wordVectors.get.getEmbeddings).map(_.toDouble) + + /* TODO tmp sanity check - remove */ + if (tmp.length != (before + after + 1) * 200) + println(doc) + + if (tmp.contains(Double.NaN)) + println(doc) + + Vectors.dense(tmp) + + } + + /* Column label must be of type NumericType but was actually of type StringType. */ + def labelToNumber() = udf { label:String => + if (label.equals("Affirmed")) 1.0 else 0.0 + } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/datasets/CoNLL2003NerReader.scala b/src/main/scala/com/johnsnowlabs/nlp/datasets/CoNLL2003NerReader.scala index dd3b7797ae8288..978ec0ea7dcab2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/datasets/CoNLL2003NerReader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/datasets/CoNLL2003NerReader.scala @@ -36,7 +36,6 @@ class CoNLL2003NerReader(wordEmbeddingsFile: String, case WordEmbeddingsFormat.SparkNlp => fileDb = wordEmbeddingsFile } - } if (new File(fileDb).exists()) { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsIndexer.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsIndexer.scala index a01580f0c69e45..01ae862a8d52bf 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsIndexer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsIndexer.scala @@ -1,8 +1,10 @@ package com.johnsnowlabs.nlp.embeddings import java.io._ -import java.nio.ByteBuffer +import java.nio.{ByteBuffer, ByteOrder} + import org.slf4j.LoggerFactory + import scala.io.Source @@ -10,6 +12,7 @@ object WordEmbeddingsIndexer { private[embeddings] def toBytes(embeddings: Array[Float]): Array[Byte] = { val buffer = ByteBuffer.allocate(embeddings.length * 4) + buffer.order(ByteOrder.LITTLE_ENDIAN) for (value <- embeddings) { buffer.putFloat(value) } @@ -18,6 +21,7 @@ object WordEmbeddingsIndexer { private[embeddings] def fromBytes(source: Array[Byte]): Array[Float] = { val wrapper = ByteBuffer.wrap(source) + wrapper.order(ByteOrder.LITTLE_ENDIAN) val result = Array.fill[Float](source.length / 4)(0f) for (i <- 0 until result.length) { diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala new file mode 100644 index 00000000000000..c26e2752113f02 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala @@ -0,0 +1,44 @@ +package com.johnsnowlabs.ml.logreg + +import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing +import com.johnsnowlabs.nlp.embeddings.AnnotatorWithWordEmbeddings +import org.apache.spark.ml.classification.LogisticRegression 
+import org.apache.spark.sql.{DataFrame, SparkSession} + +/** + * Created by jose on 24/11/17. + */ +object NegexDatasetLogRegTest extends App { + + + /* local Spark for test */ + implicit val spark = SparkSession.builder().appName("DataFrame-UDF").master("local[4]").getOrCreate() + val datasetPath = "rsAnnotations-1-120-random.txt.csv" + + val embeddingsDims = 200 + val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" + val reader = new NegexDatasetReader(embeddingsFile, embeddingsDims) + + def train(datasetPath: String) = { + import spark.implicits._ + val ds = reader.readNegexDataset(datasetPath) + + val lr = new LogisticRegression() + .setMaxIter(8) + .setRegParam(0.01) + .setElasticNetParam(0.8) + lr.fit(ds) + + } + + val model = train(datasetPath) + + // test on train data, just as a 'smoke test' + val ds = reader.readNegexDataset(datasetPath) + val result = model.transform(ds) + val total = result.count + val correct = result.filter(r => r.getAs[Double]("prediction") == r.getAs[Double]("label")).count + + println("Accuracy: " + correct.toDouble / total.toDouble) + println(s"Coefficients: ${model.coefficients} Intercept: ${model.intercept}") +} diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala new file mode 100644 index 00000000000000..887b319670526f --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala @@ -0,0 +1,52 @@ +package com.johnsnowlabs.ml.logreg + +import java.io.File + +import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing +import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer} +import org.apache.spark.sql._ + +/** + * Reader for this dataset, + * https://github.com/mongoose54/negex/blob/master/genConText/rsAnnotations-1-120-random.txt + */ +class NegexDatasetReader(wordEmbeddingsFile: String, wordEmbeddingsNDims: Int) extends Serializable with Windowing{ + + var fileDb = wordEmbeddingsFile + ".db" + WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) + + //private val mappings = Map("Affirmed" -> 0.0, "Negated" -> 1.0,"Historical" -> 2.0, "Family" -> 3.0) + + override val (before, after) = (5, 8) + + /* TODO duplicated logic, consider relocation to common place */ + override lazy val wordVectors : Option[WordEmbeddings] = + if (wordEmbeddingsFile != null) { + require(new File(wordEmbeddingsFile).exists()) + val fileDb = wordEmbeddingsFile + ".db" + if (!new File(fileDb).exists()) + WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) + + if (new File(fileDb).exists()) + Some(WordEmbeddings(fileDb, wordEmbeddingsNDims)) + else + None + } + else + None + + def readNegexDataset(datasetPath: String)(implicit session:SparkSession) = { + import session.implicits._ + + val dataset = session.read.format("com.databricks.spark.csv"). + option("delimiter", "\t"). + option("header", "true"). 
+ load(datasetPath) + + /* apply UDF to fix the length of each document */ + dataset.select(applyWindowUdf($"sentence", $"target") + .as("features"), labelToNumber()($"label").as("label")) + } + + override val embeddingsPath: String = wordEmbeddingsFile +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala index 9b26e563e6ccbe..a8c9928bae5ad6 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala @@ -1,7 +1,6 @@ package com.johnsnowlabs.nlp.annotators.assertion -import com.johnsnowlabs.nlp.AnnotatorType -import com.johnsnowlabs.nlp.annotators.assertion.logreg.{AssertionLogRegApproach, Windowing} +import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing import org.scalatest.{FlatSpec, Matchers} /** @@ -20,7 +19,6 @@ class SentenceWindowingTest extends FlatSpec with Matchers { val result = applyWindow(doc, target) val expected = Array("empty_marker", "empty_marker", "empty_marker", "empty_marker", "the", "cat", "eats", "fish", "empty_marker", "empty_marker", "empty_marker") - assert(expected === result) } @@ -56,5 +54,4 @@ class SentenceWindowingTest extends FlatSpec with Matchers { val result = applyWindow(doc, target) assert(expected === result) } - } From 8754a9567ad81c562aedc43ea2007315bff50a79 Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 30 Nov 2017 10:34:27 -0300 Subject: [PATCH 04/55] some cleanup --- .../logreg/AssertionLogRegApproach.scala | 1 - .../assertion/logreg/Windowing.scala | 24 +++----------- .../ml/logreg/NegexDatasetReader.scala | 33 ++++++++----------- 3 files changed, 18 insertions(+), 40 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 19c953cc2a2c3c..43d4fced313cea 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -47,5 +47,4 @@ class AssertionLogRegApproach extends AnnotatorApproach[AssertionLogRegModel] AssertionLogRegModel(lr) } - override val embeddingsPath: String = ??? 
} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index ce3977de1cb46b..b044fb9c83f849 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -9,12 +9,10 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} */ trait Windowing { - val before : Int = 6 - val after : Int = 18 - val embeddingsPath: String = "override me" + val before : Int + val after : Int - // hard-coded stuff - lazy val wordVectors: Option[WordEmbeddings] = None //= Some(WordEmbeddings(embeddingsPath, 200)) + lazy val wordVectors: Option[WordEmbeddings] = None /* apply window, pad/truncate sentence according to window */ @@ -49,21 +47,9 @@ trait Windowing { /* same as above, but convert the resulting text in a vector */ def applyWindowUdf = udf {(doc:String, target:String) => - val tmp : Array[Double] = applyWindow(doc.toLowerCase, target.toLowerCase).flatMap(wordVectors.get.getEmbeddings).map(_.toDouble) - - /* TODO tmp sanity check - remove */ - if (tmp.length != (before + after + 1) * 200) - println(doc) - - if (tmp.contains(Double.NaN)) - println(doc) - + val tmp = applyWindow(doc.toLowerCase, target.toLowerCase). + flatMap(wordVectors.get.getEmbeddings).map(_.toDouble) Vectors.dense(tmp) - } - /* Column label must be of type NumericType but was actually of type StringType. */ - def labelToNumber() = udf { label:String => - if (label.equals("Affirmed")) 1.0 else 0.0 - } } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala index 887b319670526f..616ceedc8647b5 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala @@ -5,6 +5,7 @@ import java.io.File import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer} import org.apache.spark.sql._ +import org.apache.spark.sql.functions.udf /** * Reader for this dataset, @@ -13,27 +14,18 @@ import org.apache.spark.sql._ class NegexDatasetReader(wordEmbeddingsFile: String, wordEmbeddingsNDims: Int) extends Serializable with Windowing{ var fileDb = wordEmbeddingsFile + ".db" - WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) - - //private val mappings = Map("Affirmed" -> 0.0, "Negated" -> 1.0,"Historical" -> 2.0, "Family" -> 3.0) - + private val mappings = Map("Affirmed" -> 0.0, "Negated" -> 1.0,"Historical" -> 2.0, "Family" -> 3.0) override val (before, after) = (5, 8) /* TODO duplicated logic, consider relocation to common place */ - override lazy val wordVectors : Option[WordEmbeddings] = - if (wordEmbeddingsFile != null) { - require(new File(wordEmbeddingsFile).exists()) - val fileDb = wordEmbeddingsFile + ".db" - if (!new File(fileDb).exists()) - WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) - - if (new File(fileDb).exists()) - Some(WordEmbeddings(fileDb, wordEmbeddingsNDims)) - else - None - } - else - None + override lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { + wordEmbeddingsFile => + require(new File(wordEmbeddingsFile).exists()) + val fileDb = wordEmbeddingsFile + ".db" + if (!new File(fileDb).exists()) + WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) + }.filter(_ => 
new File(fileDb).exists()) + .map(_ => WordEmbeddings(fileDb, wordEmbeddingsNDims)) def readNegexDataset(datasetPath: String)(implicit session:SparkSession) = { import session.implicits._ @@ -45,8 +37,9 @@ class NegexDatasetReader(wordEmbeddingsFile: String, wordEmbeddingsNDims: Int) e /* apply UDF to fix the length of each document */ dataset.select(applyWindowUdf($"sentence", $"target") - .as("features"), labelToNumber()($"label").as("label")) + .as("features"), labelToNumber($"label").as("label")) } - override val embeddingsPath: String = wordEmbeddingsFile + def labelToNumber = udf { label:String => mappings.get(label)} + } From 2601e8b8b34937ea3b2b09be46488d25a2a12217 Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 30 Nov 2017 10:42:37 -0300 Subject: [PATCH 05/55] cleanup --- .../com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala index c26e2752113f02..4d5b33cdff9c94 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala @@ -1,7 +1,5 @@ package com.johnsnowlabs.ml.logreg -import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing -import com.johnsnowlabs.nlp.embeddings.AnnotatorWithWordEmbeddings import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.sql.{DataFrame, SparkSession} @@ -10,7 +8,6 @@ import org.apache.spark.sql.{DataFrame, SparkSession} */ object NegexDatasetLogRegTest extends App { - /* local Spark for test */ implicit val spark = SparkSession.builder().appName("DataFrame-UDF").master("local[4]").getOrCreate() val datasetPath = "rsAnnotations-1-120-random.txt.csv" @@ -28,7 +25,6 @@ object NegexDatasetLogRegTest extends App { .setRegParam(0.01) .setElasticNetParam(0.8) lr.fit(ds) - } val model = train(datasetPath) From dc3f4c453c3844a83375a6d8b05a7acf9b68de8d Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 30 Nov 2017 16:22:37 -0300 Subject: [PATCH 06/55] i2b2 reader --- .../ml/logreg/I2b2DatasetLogRegTest.scala | 13 ++++ .../johnsnowlabs/ml/logreg/I2b2Reader.scala | 73 +++++++++++++++++++ .../ml/logreg/NegexDatasetLogRegTest.scala | 15 ++-- 3 files changed, 94 insertions(+), 7 deletions(-) create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala new file mode 100644 index 00000000000000..4049f4c97ff961 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala @@ -0,0 +1,13 @@ +package com.johnsnowlabs.ml.logreg + +import org.apache.spark.sql.SparkSession + +object I2b2DatasetLogRegTest extends App { + + implicit val spark = SparkSession.builder().appName("DataFrame-UDF").master("local[4]").getOrCreate() + val datasetPath = "/home/jose/Downloads/concept_assertion_relation_training_data/beth" + val reader = new I2b2DatasetReader(datasetPath) + + reader.readDataset.printSchema() + +} diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala new file mode 100644 index 00000000000000..77d27ed9ab6933 --- /dev/null +++ 
b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -0,0 +1,73 @@ +package com.johnsnowlabs.ml.logreg + +import java.io.File +import org.apache.spark.sql.{DataFrame, SparkSession} +import scala.io.Source + + +class I2b2DatasetReader(datasetPath: String) { + + def readDataset(implicit session: SparkSession): DataFrame = { + import session.implicits._ + // read list of ast files, without extension + val astFileNames = { + val ast = new File(s"$datasetPath/ast/") + if (ast.exists && ast.isDirectory) + ast.listFiles.filter(_.isFile).toList.map(_.getName.dropRight(4)) + else + List[String]() + } + + // extract datapoints from each file + val dataset = + for {name <- astFileNames + annotation <- Source.fromFile(s"$datasetPath/ast/$name.ast").getLines() + sourceTxt = Source.fromFile(s"$datasetPath/txt/$name.txt").getLines().toList + } yield { + val record = I2b2Annotation(annotation) + val text = sourceTxt(record.sourceLine - 1) + I2b2AnnotationAndText(text, record.target, record.label, record.start, record.end) + } + dataset.toDF() + } +} +case class I2b2Annotation(target: String, label: String, start:Int, end:Int, sourceLine:Int) +case class I2b2AnnotationAndText(text: String, target: String, label: String, start:Int, end:Int) + +object I2b2Annotation { + + private def extractTarget(text:String): String = { + val pattern = "c=\"(.*)\"".r + pattern.findFirstMatchIn(text).map(_.group(1)). + getOrElse(throw new RuntimeException("Broken dataset - bad target")) + } + + private def extractSourceLine(text: String): Int = { + val pattern = "(\\d+):\\d+".r + pattern.findFirstMatchIn(text).map(_.group(1)). + getOrElse(throw new RuntimeException("Broken dataset - bad source line")).toInt + } + + def extractLimits(text: String): (Int, Int) = { + val pattern = "\\d+:(\\d+)".r + pattern.findAllMatchIn(text).map(_.group(1)).toList match { + case start::end::Nil => (start.toInt, end.toInt) + case _ => throw new RuntimeException("Broken dataset - bad start and end") + } + } + + def extractLabel(text: String) = { + val pattern = "a=\"(.*)\"".r + pattern.findFirstMatchIn(text).map(_.group(1)). 
+      getOrElse(throw new RuntimeException("Broken dataset - bad label"))
+  }
+
+  def apply(annotation: String): I2b2Annotation = {
+    val chunks = annotation.split("\\|\\|")
+    val target = extractTarget(chunks(0))
+    val sourceLine = extractSourceLine(chunks(0))
+    val (start, end) = extractLimits(chunks(0))
+    val label = extractLabel(chunks(2))
+    I2b2Annotation(target, label, start, end, sourceLine)
+  }
+}
diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala
index 4d5b33cdff9c94..e469ef75700758 100644
--- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala
@@ -16,22 +16,23 @@ object NegexDatasetLogRegTest extends App {
   val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin"
   val reader = new NegexDatasetReader(embeddingsFile, embeddingsDims)
 
-  def train(datasetPath: String) = {
+  def train(dataFrame: DataFrame) = {
     import spark.implicits._
-    val ds = reader.readNegexDataset(datasetPath)
-
     val lr = new LogisticRegression()
       .setMaxIter(8)
       .setRegParam(0.01)
       .setElasticNetParam(0.8)
-    lr.fit(ds)
+    lr.fit(dataFrame)
   }
 
-  val model = train(datasetPath)
-
   // test on train data, just as a 'smoke test'
   val ds = reader.readNegexDataset(datasetPath)
-  val result = model.transform(ds)
+
+  // Split the data into training and test sets (30% held out for testing).
+  val Array(trainingData, testData) = ds.randomSplit(Array(0.7, 0.3))
+  val model = train(trainingData)
+
+  val result = model.transform(testData)
   val total = result.count
   val correct = result.filter(r => r.getAs[Double]("prediction") == r.getAs[Double]("label")).count
 
From fdca19b3987321215ae32eba8dc1be6747ca290c Mon Sep 17 00:00:00 2001
From: Alberto
Date: Mon, 4 Dec 2017 11:27:20 -0300
Subject: [PATCH 07/55] fixed problem with windows

---
 .../nlp/annotators/assertion/logreg/Windowing.scala          | 6 +++---
 .../com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala | 5 +++++
 2 files changed, 8 insertions(+), 3 deletions(-)
 create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala

diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala
index b044fb9c83f849..4200b4d7b58a0d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala
@@ -14,16 +14,16 @@ trait Windowing {
 
   lazy val wordVectors: Option[WordEmbeddings] = None
 
-
   /* apply window, pad/truncate sentence according to window */
   def applyWindow(doc: String, target:String) : Array[String] = {
+    println(target)
     val sentSplits = doc.split(target).map(_.trim)
     val targetPart = target.split(" ")
 
-    val leftPart = if (sentSplits.head.isEmpty) Array[String]()
+    val leftPart = if (sentSplits.headOption.isEmpty || sentSplits.head.isEmpty) Array[String]()
       else sentSplits.head.split(" ")
 
-    val rightPart = if (sentSplits.length == 1) Array[String]()
+    val rightPart = if (sentSplits.length == 1 || sentSplits.lastOption.isEmpty) Array[String]()
       else sentSplits.last.split(" ")
 
     val (start, leftPadding) =
diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala
new file mode 100644
index 00000000000000..edc3d9afbd372f
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala
@@ -0,0 +1,5 @@
+package com.johnsnowlabs.ml.logreg
+
+class I2b2DatasetPipelineTest {
+
+}
From 9a06fbc79cebaa5901996b99000e5d671db4fdd6 Mon Sep 17 00:00:00 2001
From: Alberto
Date: Mon, 4 Dec 2017 11:29:23 -0300
Subject: [PATCH 08/55] enhancements in i2b2 reader

---
 .../johnsnowlabs/ml/logreg/I2b2Reader.scala   | 87 +++++++++++++++----
 1 file changed, 69 insertions(+), 18 deletions(-)

diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala
index 77d27ed9ab6933..e1306118a4c40d 100644
--- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala
@@ -1,17 +1,34 @@
 package com.johnsnowlabs.ml.logreg
 
 import java.io.File
+
+import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing
+import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer}
+import org.apache.spark.sql.functions.udf
 import org.apache.spark.sql.{DataFrame, SparkSession}
+
 import scala.io.Source
 
+/*
+* datasetPath: a list of datasets, for example the 'beth' or 'partners' directories (each containing
+* an ast and txt folder).
+*
+* */
 
-class I2b2DatasetReader(datasetPath: String) {
+class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Windowing {
+
+  override val (before, after) = (10, 10)
+  var fileDb = wordEmbeddingsFile + ".db"
+
+
+  /* receives the location of a single dataset (e.g. 'beth'),
+   * and returns a sequence of datapoints I2b2AnnotationAndText
+   * */
+  def read(path: String): Seq[I2b2AnnotationAndText] = {
 
-  def readDataset(implicit session: SparkSession): DataFrame = {
-    import session.implicits._
     // read list of ast files, without extension
     val astFileNames = {
-      val ast = new File(s"$datasetPath/ast/")
+      val ast = new File(s"$path/ast/")
       if (ast.exists && ast.isDirectory)
         ast.listFiles.filter(_.isFile).toList.map(_.getName.dropRight(4))
       else
@@ -19,17 +36,42 @@ class I2b2DatasetReader(datasetPath: String) {
     }
 
     // extract datapoints from each file
-    val dataset =
-      for {name <- astFileNames
-           annotation <- Source.fromFile(s"$datasetPath/ast/$name.ast").getLines()
-           sourceTxt = Source.fromFile(s"$datasetPath/txt/$name.txt").getLines().toList
-      } yield {
-        val record = I2b2Annotation(annotation)
-        val text = sourceTxt(record.sourceLine - 1)
-        I2b2AnnotationAndText(text, record.target, record.label, record.start, record.end)
-      }
-    dataset.toDF()
+    for {name <- astFileNames
+         annotation <- Source.fromFile(s"$path/ast/$name.ast").getLines()
+         sourceTxt = Source.fromFile(s"$path/txt/$name.txt").getLines().toList
+    } yield {
+      val record = I2b2Annotation(annotation)
+      val text = sourceTxt(record.sourceLine - 1)
+      I2b2AnnotationAndText(text, record.target, record.label, record.start, record.end)
+    }
+  }
+
+  /* reads all the locations for all datasets (e.g. 
['beth', 'partners']), + * and returns a Spark DataFrame + * */ + def readDataFrame(datasetPaths: Seq[String]) (implicit session: SparkSession): DataFrame= { + import session.implicits._ + datasetPaths.flatMap(read).toDF + .select(applyWindowUdf($"text", $"target") + .as("features"), labelToNumber($"label").as("label")) } + + private val mappings = Map("hypothetical" -> 0.0, + "present" -> 1.0, "absent" -> 2.0, "possible" -> 3.0, + "conditional"-> 4.0, "associated_with_someone_else" -> 5.0) + + /* TODO duplicated logic, consider relocation to common place */ + override lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { + wordEmbeddingsFile => + require(new File(wordEmbeddingsFile).exists()) + val fileDb = wordEmbeddingsFile + ".db" + if (!new File(fileDb).exists()) + WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) + }.filter(_ => new File(fileDb).exists()) + .map(_ => WordEmbeddings(fileDb, 200)) + + def labelToNumber = udf { label:String => mappings.get(label)} + } case class I2b2Annotation(target: String, label: String, start:Int, end:Int, sourceLine:Int) case class I2b2AnnotationAndText(text: String, target: String, label: String, start:Int, end:Int) @@ -49,11 +91,20 @@ object I2b2Annotation { } def extractLimits(text: String): (Int, Int) = { - val pattern = "\\d+:(\\d+)".r - pattern.findAllMatchIn(text).map(_.group(1)).toList match { - case start::end::Nil => (start.toInt, end.toInt) - case _ => throw new RuntimeException("Broken dataset - bad start and end") + val startPattern = "\\d+:(\\d+)\\s\\d+:\\d+".r + val endPattern = "\\d+:\\d+\\s\\d+:(\\d+)".r + + + val start = startPattern.findAllMatchIn(text).map(_.group(1)).toList match { + case s::Nil => s.toInt + case _ => throw new RuntimeException("Broken dataset - bad start") + } + + val end = endPattern.findAllMatchIn(text).map(_.group(1)).toList match { + case e::Nil => e.toInt + case _ => throw new RuntimeException("Broken dataset - bad end") } + (start, end) } def extractLabel(text: String) = { From 4dd89f6c2b43164237cd908d227f5bdcb7c72164 Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 4 Dec 2017 11:29:44 -0300 Subject: [PATCH 09/55] added case for windowing unit test --- .../annotators/assertion/SentenceWindowingTest.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala index a8c9928bae5ad6..1985721b6613b0 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala @@ -54,4 +54,15 @@ class SentenceWindowingTest extends FlatSpec with Matchers { val result = applyWindow(doc, target) assert(expected === result) } + + "target occupies the whole text" should "be correctly chunked and padded" in new Scope { + val doc = "post-operative transient ischemic attack" + val target = "post-operative transient ischemic attack" + val expected = ("empty_marker empty_marker empty_marker empty_marker empty_marker " + + "post-operative transient ischemic attack empty_marker empty_marker").split(" ") + + val result = applyWindow(doc, target) + assert(expected === result) + } + } From 4b60b1b3c1c05531a08af604bf2b88764156fc9c Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 8 Dec 2017 00:04:00 -0300 Subject: [PATCH 10/55] work in progress assertion model in the pipeline --- 
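Note on the approach this patch wires into the pipeline: every (text, target) pair is reduced
to a fixed-length feature vector by concatenating one word embedding per token of the
padded/truncated window, so the Spark LogisticRegression always sees
(before + after + 1) * dims values per row. A minimal standalone sketch of that feature
assembly, with `lookup` standing in for the embeddings lookup; the helper name and signature
are illustrative, not part of the patch:

    import org.apache.spark.ml.linalg.{Vector, Vectors}

    def featurize(window: Array[String], dims: Int,
                  lookup: String => Array[Float]): Vector = {
      // one dims-sized embedding per window token, concatenated in order;
      // "empty_marker" padding tokens must also resolve to a dims-sized vector
      val values = window.flatMap(token => lookup(token).map(_.toDouble))
      require(values.length == window.length * dims, "embedding size mismatch")
      Vectors.dense(values)
    }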
.../com/johnsnowlabs/nlp/AnnotatorModel.scala | 2 +- .../logreg/AssertionLogRegApproach.scala | 61 +++++++++---- .../logreg/AssertionLogRegModel.scala | 31 +++++-- .../assertion/logreg/Windowing.scala | 66 +++++++++++--- .../nlp/annotators/common/Tagged.scala | 2 + .../AnnotatorWithWordEmbeddings.scala | 4 +- .../embeddings/ModelWithWordEmbeddings.scala | 1 + .../ml/logreg/I2b2DatasetLogRegTest.scala | 67 ++++++++++++-- .../ml/logreg/I2b2DatasetPipelineTest.scala | 89 ++++++++++++++++++- .../johnsnowlabs/ml/logreg/I2b2Reader.scala | 24 +++-- .../ml/logreg/NegexDatasetReader.scala | 3 +- 11 files changed, 297 insertions(+), 53 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala index 51b288451bf50c..1ab889583c4635 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala @@ -61,7 +61,7 @@ abstract class AnnotatorModel[M <: Model[M]] * @param dataset [[Dataset[Row]]] * @return */ - override final def transform(dataset: Dataset[_]): DataFrame = { + override def transform(dataset: Dataset[_]): DataFrame = { require(validate(dataset.schema), s"Missing annotators in pipeline. Make sure the following are present: " + s"${requiredAnnotatorTypes.mkString(", ")}") val metadataBuilder: MetadataBuilder = new MetadataBuilder() diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 43d4fced313cea..a6743608037548 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -2,49 +2,78 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.embeddings.AnnotatorWithWordEmbeddings -import com.johnsnowlabs.nlp.{AnnotatorApproach} +import com.johnsnowlabs.nlp.AnnotatorApproach +import com.johnsnowlabs.nlp.annotators.common.NerTagged.{getAnnotations, getLabels} +import com.johnsnowlabs.nlp.annotators.common.PosTagged import org.apache.spark.sql.functions.udf import org.apache.spark.ml.param.Param -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.util.Identifiable /** * Created by jose on 22/11/17. 
*/ -class AssertionLogRegApproach extends AnnotatorApproach[AssertionLogRegModel] +class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproach[AssertionLogRegModel] with AnnotatorWithWordEmbeddings with Windowing { - override val requiredAnnotatorTypes = Array(TOKEN) + override val requiredAnnotatorTypes = Array(DOCUMENT, POS) override val description: String = "Clinical Text Status Assertion" - override val annotatorType: AnnotatorType = null - override val uid: String = "" + override val annotatorType: AnnotatorType = ASSERTION + def this() = this(Identifiable.randomUID("ASSERTION")) + override lazy val localPath = "/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin.db" // example of possible values, 'Negated', 'Affirmed', 'Historical' - val labelColumn = new Param[String](this, "labelColumn", "Column with one label per document") + //val labelColumn = new Param[String](this, "label", "Column with one label per document") // the document where we're extracting the assertion - val documentColumn = new Param[String](this, "documentColumn", "Column with one label per document") + //val documentColumn = new Param[String](this, "documentColumn", "Column with one label per document") // the target term, that must appear capitalized in the document, e.g., 'diabetes' - val targetColumn = new Param[String](this, "targetColumn", "Column with the target to analyze") + //val targetColumn = new Param[String](this, "targetColumn", "Column with the target to analyze") - override val (before, after) = (10, 5) + override val (before, after) = (10, 14) + var tag2Vec : Map[String, Array[Double]] = Map() override def train(dataset: Dataset[_]): AssertionLogRegModel = { import dataset.sqlContext.implicits._ + /* read the set of all tags */ + //val tagSet = inferTagSet(dataset.toDF) + //dataset.collect() + + /* assign each tag an array of 3 floats */ + //tag2Vec = encode(tagSet) + /* apply UDF to fix the length of each document */ - val processed = dataset.toDF.select(applyWindowUdf($"documentColumn", $"targetColumn") - .as("window"), $"labelColumn".as("label")) + val processed = dataset.toDF. 
+ withColumn("features", applyWindowUdf(embeddings.get)($"text", $"target")).cache() + //.select($"features", $"label") /* TODO: pick the parameters you want to expose*/ val lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) + .setMaxIter(20) + .setRegParam(0.002) .setElasticNetParam(0.8) - lr.fit(processed) - AssertionLogRegModel(lr) + + fillModelEmbeddings(AssertionLogRegModel(lr.fit(processed), tag2Vec)) } + + def inferTagSet(dataset: Dataset[Row]): Array[String] = + dataset.select("pos") + .collect() + .flatMap { row => + row.getAs[Seq[Row]](0).map(_.getString(3)).distinct + }.distinct + + + def encode(tagSet: Array[String]) : Map[String, Array[Double]]= { + val values = Array(.25, .50, .75, 1) + val codes = for (a <- values; + b <- values; + c <- values) yield Array(a, b, c) + tagSet.sorted.zip(codes).toMap + } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index bf861818ccdbc7..1506607ce1f140 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -1,13 +1,21 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg +import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT, POS} import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType} import com.johnsnowlabs.nlp.embeddings.ModelWithWordEmbeddings -import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.{DataFrame, Dataset} /** * Created by jose on 22/11/17. */ -class AssertionLogRegModel(model:LogisticRegression) extends AnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings{ + +class AssertionLogRegModel(model:LogisticRegressionModel, tag2Vec: Map[String, Array[Double]], override val uid: String = Identifiable.randomUID("ASSERTIOM")) + extends AnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings with Windowing { + + // TODO this should come as a parameter + override val (before, after) = (10, 14) /** * takes a document and annotations and produces new annotations of this annotator's annotation type @@ -15,13 +23,24 @@ class AssertionLogRegModel(model:LogisticRegression) extends AnnotatorModel[Asse * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any * @return any number of annotations processed for every input annotation. Not necessary one to one relationship */ - override protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] = ??? + override protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] = annotations + //def this() = this(Identifiable.randomUID("ASSERTION")) + override val annotatorType: AnnotatorType = AnnotatorType.ASSERTION - override val requiredAnnotatorTypes: Array[String] = null - override val uid: String = "" + override val requiredAnnotatorTypes = Array(DOCUMENT, POS) + override final def transform(dataset: Dataset[_]): DataFrame = { + import dataset.sqlContext.implicits._ + + /* apply UDF to fix the length of each document */ + val processed = dataset.toDF. 
+ withColumn("features", applyWindowUdf(embeddings.get)($"text", $"target")).cache() //, $"pos", $"start", $"end" + //.select($"features", $"label") + + super.transform(model.transform(processed)) + } } object AssertionLogRegModel { - def apply(model: LogisticRegression): AssertionLogRegModel = new AssertionLogRegModel(model) + def apply(model: LogisticRegressionModel, tag2Vec: Map[String, Array[Double]]): AssertionLogRegModel = new AssertionLogRegModel(model, tag2Vec) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 4200b4d7b58a0d..932c7e1ba47ded 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -3,11 +3,15 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings} import org.apache.spark.sql.functions._ import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema + +import scala.collection.mutable +import scala.util.Try /** * Created by jose on 24/11/17. */ -trait Windowing { +trait Windowing extends Serializable{ val before : Int val after : Int @@ -15,16 +19,17 @@ trait Windowing { lazy val wordVectors: Option[WordEmbeddings] = None /* apply window, pad/truncate sentence according to window */ - def applyWindow(doc: String, target:String) : Array[String] = { - println(target) - val sentSplits = doc.split(target).map(_.trim) - val targetPart = target.split(" ") + def applyWindow(doc: String, s: Int, e: Int) : Array[String] = { + + val target = doc.slice(s, e) + val targetPart = tokenize(target.trim) + //println(target) - val leftPart = if (sentSplits.headOption.isEmpty || sentSplits.head.isEmpty) Array[String]() - else sentSplits.head.split(" ") + val leftPart = if (s == 0) Array[String]() + else tokenize(doc.slice(0, s).trim) //TODO add proper tokenizer here - val rightPart = if (sentSplits.length == 1 || sentSplits.lastOption.isEmpty) Array[String]() - else sentSplits.last.split(" ") + val rightPart = if (e == doc.length) Array[String]() + else tokenize(doc.slice(e, doc.length).trim) val (start, leftPadding) = if(leftPart.size >= before) @@ -44,12 +49,51 @@ trait Windowing { leftPadding ++ leftContext ++ targetPart ++ rightContext ++ rightPadding } + /* apply window, pad/truncate sentence according to window */ + def applyWindow(doc: String, target: String) : Array[String] = { + val start = doc.indexOf(target) + val end = start + target.length + applyWindow(doc, start, end) + } + + /* same as above, but convert the resulting text in a vector */ + def applyWindowUdf(wvectors: WordEmbeddings, codes: Map[String, Array[Double]]) = + udf {(doc:String, pos:mutable.WrappedArray[GenericRowWithSchema], start:Int, end:Int, target:String) => + val tmp = applyWindow(doc.toLowerCase, target.toLowerCase). 
+ flatMap(wvectors.getEmbeddings).map(_.toDouble) + + val empty = Array.fill(3)(0.0) + val previous = if (start < 3) empty ++ empty ++ empty + else pos.toArray.slice(start - 3, start).map(_.getString(3)).toArray.flatMap(tag => codes.get(tag).getOrElse(empty)) + + val result = if (previous.length == 9) tmp ++ previous else tmp ++ empty ++ empty ++ empty + //if(result.length != 4009) + //println(tmp.length, previous.length) + Vectors.dense(result) + } + /* same as above, but convert the resulting text in a vector */ - def applyWindowUdf = + def applyWindowUdf(wvectors: WordEmbeddings) = udf {(doc:String, target:String) => val tmp = applyWindow(doc.toLowerCase, target.toLowerCase). - flatMap(wordVectors.get.getEmbeddings).map(_.toDouble) + flatMap(wvectors.getEmbeddings).map(_.toDouble) Vectors.dense(tmp) } + /* appends POS tags at the end of the vector */ + def appendPos(codes: Map[String, Array[Double]]) = + udf {(vector:Vector, pos:mutable.WrappedArray[GenericRowWithSchema], start:Int, end:Int, target:String) => + val empty = Array.fill(9)(0.0) + val previous = if (start < 3) empty + else pos.toArray.slice(start - 3, start).map(_.getString(3)).toArray.flatMap(tag => codes.get(tag).getOrElse(empty)) + + Vectors.dense(vector.toArray ++ empty) + } + + val punctuation = Seq(".", ":", ";", ",", "?", "!", "+", "-", "_", "(", ")", "{", + "}", "#", "/", "\\", "\"", "\'", "[", "]", "%", "<", ">", "&", "=") + + /* Tokenize a sentence taking care of punctuation */ + def tokenize(sentence: String) : Array[String] = sentence.split(" ") + } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala index ae94b3af7abb77..c63c0e1f82b3d8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala @@ -105,4 +105,6 @@ object NerTagged extends Tagged[NerTaggedSentence]{ labels.zip(sentences) } } + + } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala index 755c3558821734..cdeab880ea2ae4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala @@ -41,7 +41,7 @@ trait AnnotatorWithWordEmbeddings extends AutoCloseable { this: Estimator[_] => val file = "/" + new Path(localPath).getName val path = Path.mergePaths(new Path($(embeddingsFolder)), new Path(file)) - hdfs.copyFromLocalFile(new Path(localPath), path) + //hdfs.copyFromLocalFile(new Path(localPath), path) model.setDims($(embeddingsNDims)) @@ -54,7 +54,7 @@ trait AnnotatorWithWordEmbeddings extends AutoCloseable { this: Estimator[_] => get(sourceEmbeddingsPath).map(_ => WordEmbeddings(localPath, $(embeddingsNDims))) } - private lazy val localPath: String = { + lazy val localPath: String = { val path = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_idx") .toAbsolutePath.toString diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ModelWithWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ModelWithWordEmbeddings.scala index fbbd5d3cea9dab..45b6124354cfcc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ModelWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ModelWithWordEmbeddings.scala @@ -52,6 +52,7 @@ trait ModelWithWordEmbeddings extends 
AutoCloseable { localFile } + @transient lazy val embeddings: Option[WordEmbeddings] = { get(indexPath).map { path => WordEmbeddings(embeddingsFile, $(nDims)) diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala index 4049f4c97ff961..e39bd40543b5a8 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala @@ -1,13 +1,68 @@ + package com.johnsnowlabs.ml.logreg -import org.apache.spark.sql.SparkSession +import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.sql.{DataFrame, SparkSession} + +object I2b2DatasetLogRegTest extends App with Windowing { + + override val before = 10 + override val after = 14 + + implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[2]").getOrCreate() + + val trainDatasetPath = Seq("/home/jose/Downloads/i2b2/concept_assertion_relation_training_data/partners", + "/home/jose/Downloads/i2b2/concept_assertion_relation_training_data/beth") + + val testDatasetPath = Seq("/home/jose/Downloads/i2b2/test_data") + + val embeddingsDims = 200 + val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" + val reader = new I2b2DatasetReader(embeddingsFile) + + import spark.implicits._ + val trainDataset = reader.readDataFrame(trainDatasetPath). + withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target")) + .select($"features", $"label") + + println("trainDsSize: " + trainDataset.count) + val testDataset = reader.readDataFrame(testDatasetPath). + withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target")) + .select($"features", $"label") + + println("testDsSize: " + testDataset.count) + + val model = train(trainDataset) + val result = model.transform(testDataset) + + import spark.implicits._ + case class TpFnFp(tp: Int, fn: Int, fp: Int) + val tpTnFp = result.map ({ r => + if (r.getAs[Double]("prediction") == r.getAs[Double]("label")) TpFnFp(1, 0, 0) + else TpFnFp(0, 1, 1) + }).collect().reduce((t1, t2) => TpFnFp(t1.tp + t2.tp, t1.fn + t2.fn, t1.fp + t2.fp)) + + println(calcStat(tpTnFp.tp + tpTnFp.fn, tpTnFp.tp + tpTnFp.fp, tpTnFp.tp)) -object I2b2DatasetLogRegTest extends App { + def train(dataFrame: DataFrame) = { + import spark.implicits._ + val lr = new LogisticRegression() + .setMaxIter(20) + .setRegParam(0.002) + .setElasticNetParam(0.8) + lr.fit(dataFrame) + } - implicit val spark = SparkSession.builder().appName("DataFrame-UDF").master("local[4]").getOrCreate() - val datasetPath = "/home/jose/Downloads/concept_assertion_relation_training_data/beth" - val reader = new I2b2DatasetReader(datasetPath) + /* TODO put in a common place */ + def calcStat(correct: Long, predicted: Long, predictedCorrect: Long): (Float, Float, Float) = { + // prec = (predicted & correct) / predicted + // rec = (predicted & correct) / correct + val prec = predictedCorrect.toFloat / predicted + val rec = predictedCorrect.toFloat / correct + val f1 = 2 * prec * rec / (prec + rec) + (prec, rec, f1) + } - reader.readDataset.printSchema() } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index edc3d9afbd372f..197709d4105ccf 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala 
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -1,5 +1,92 @@ package com.johnsnowlabs.ml.logreg -class I2b2DatasetPipelineTest { +import com.johnsnowlabs.ml.logreg.I2b2DatasetLogRegTest.calcStat +import com.johnsnowlabs.nlp.DocumentAssembler +import com.johnsnowlabs.nlp.annotators.RegexTokenizer +import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach +import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel +import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat +import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} +import org.apache.spark.sql.SparkSession + +object I2b2DatasetPipelineTest extends App { + + implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[4]") + .config("spark.executor.memory", "2g").getOrCreate + + import spark.implicits._ + val trainPaths = Seq("/home/jose/Downloads/i2b2/concept_assertion_relation_training_data/partners" + ,"/home/jose/Downloads/i2b2/concept_assertion_relation_training_data/beth") + val testPaths = Seq("/home/jose/Downloads/i2b2/test_data") + + val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" + + def getAssertionStages(): Array[_ <: PipelineStage] = { + + val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val tokenizer = new RegexTokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + val posTagger = new PerceptronApproach() + .setCorpusPath("/anc-pos-corpus/") + .setNIterations(10) + .setInputCols("token", "document") + .setOutputCol("pos") + + val assertionStatus = new AssertionLogRegApproach() + .setInputCols("document", "pos") + .setOutputCol("assertion") + .setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary) + .setEmbeddingsFolder("/home/jose/Downloads/bio_nlp_vec") + + Array(documentAssembler, + tokenizer, + posTagger, + assertionStatus) + + } + + val reader = new I2b2DatasetReader(embeddingsFile) + + def trainAssertionModel(paths: Seq[String]): PipelineModel = { + System.out.println("Train Dataset Reading") + val time = System.nanoTime() + val dataset = reader.readDataFrame(paths) + System.out.println(s"Done, ${(System.nanoTime() - time)/1e9}\n") + System.out.println("Start fitting") + + // train Assertion Status + val pipeline = new Pipeline() + .setStages(getAssertionStages) + + pipeline.fit(dataset) + } + + def testAssertionModel(path:Seq[String], model: PipelineModel) = { + System.out.println("Test Dataset Reading") + val dataset = reader.readDataFrame(path) + model.transform(dataset) + } + + + + val model = trainAssertionModel(trainPaths) + val result = testAssertionModel(testPaths, model) + + /* TODO all this to common place */ + import spark.implicits._ + case class TpFnFp(tp: Int, fn: Int, fp: Int) + val tpFnFp = result.map ({ r => + if (r.getAs[Double]("prediction") == r.getAs[Double]("label")) TpFnFp(1, 0, 0) + else TpFnFp(0, 1, 1) + }).collect().reduce((t1, t2) => TpFnFp(t1.tp + t2.tp, t1.fn + t2.fn, t1.fp + t2.fp)) + + println(calcStat(tpFnFp.tp + tpFnFp.fn, tpFnFp.tp + tpFnFp.fp, tpFnFp.tp)) + } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala index e1306118a4c40d..49dbed578309c7 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -17,7 +17,7 @@ 
import scala.io.Source class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Windowing { - override val (before, after) = (10, 10) + override val (before, after) = (8, 12) var fileDb = wordEmbeddingsFile + ".db" @@ -35,25 +35,34 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Wi List[String]() } + var tooLong = 0 + // extract datapoints from each file - for {name <- astFileNames + val datapoints = for {name <- astFileNames annotation <- Source.fromFile(s"$path/ast/$name.ast").getLines() sourceTxt = Source.fromFile(s"$path/txt/$name.txt").getLines().toList } yield { val record = I2b2Annotation(annotation) val text = sourceTxt(record.sourceLine - 1) - I2b2AnnotationAndText(text, record.target, record.label, record.start, record.end) + if(record.target.split(" ").length > 8){ + tooLong += 1 + null + } + else + I2b2AnnotationAndText(text, record.target, record.label, record.start, record.end) } + println("number of targets too long: " + tooLong) + datapoints } - /* reads the all the locations for all datasets (e.g. ['beth', 'partners']), + /* reads all the locations for all datasets (e.g. ['beth', 'partners']), * and returns a Spark DataFrame * */ def readDataFrame(datasetPaths: Seq[String]) (implicit session: SparkSession): DataFrame= { + //TODO: should windowing be here? import session.implicits._ - datasetPaths.flatMap(read).toDF - .select(applyWindowUdf($"text", $"target") - .as("features"), labelToNumber($"label").as("label")) + datasetPaths.flatMap(read).filter(_!=null).toDF.withColumn("label", labelToNumber($"label")) + } private val mappings = Map("hypothetical" -> 0.0, @@ -94,7 +103,6 @@ object I2b2Annotation { val startPattern = "\\d+:(\\d+)\\s\\d+:\\d+".r val endPattern = "\\d+:\\d+\\s\\d+:(\\d+)".r - val start = startPattern.findAllMatchIn(text).map(_.group(1)).toList match { case s::Nil => s.toInt case _ => throw new RuntimeException("Broken dataset - bad start") diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala index 616ceedc8647b5..89e24948655903 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala @@ -21,7 +21,6 @@ class NegexDatasetReader(wordEmbeddingsFile: String, wordEmbeddingsNDims: Int) e override lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { wordEmbeddingsFile => require(new File(wordEmbeddingsFile).exists()) - val fileDb = wordEmbeddingsFile + ".db" if (!new File(fileDb).exists()) WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) }.filter(_ => new File(fileDb).exists()) @@ -36,7 +35,7 @@ class NegexDatasetReader(wordEmbeddingsFile: String, wordEmbeddingsNDims: Int) e load(datasetPath) /* apply UDF to fix the length of each document */ - dataset.select(applyWindowUdf($"sentence", $"target") + dataset.select(applyWindowUdf(null, null)($"sentence", $"target") .as("features"), labelToNumber($"label").as("label")) } From 427fdbdaf2be0f84677c346a3701bdc7853e4d64 Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 8 Dec 2017 12:36:36 -0300 Subject: [PATCH 11/55] work in progress --- .../logreg/AssertionLogRegApproach.scala | 20 +++++++++++-------- .../logreg/AssertionLogRegModel.scala | 4 ++-- .../assertion/logreg/Windowing.scala | 9 ++++++++- .../ml/logreg/I2b2DatasetLogRegTest.scala | 19 ++++++++++++++++++ .../ml/logreg/I2b2DatasetPipelineTest.scala | 15 +++++++++++--- 
.../johnsnowlabs/ml/logreg/I2b2Reader.scala | 2 +- 6 files changed, 54 insertions(+), 15 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index a6743608037548..a0a910833ceabf 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -32,28 +32,28 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac // the target term, that must appear capitalized in the document, e.g., 'diabetes' //val targetColumn = new Param[String](this, "targetColumn", "Column with the target to analyze") - override val (before, after) = (10, 14) + override val (before, after) = (10, 16) var tag2Vec : Map[String, Array[Double]] = Map() override def train(dataset: Dataset[_]): AssertionLogRegModel = { import dataset.sqlContext.implicits._ /* read the set of all tags */ - //val tagSet = inferTagSet(dataset.toDF) + val tagSet = inferTagSet(dataset.toDF) //dataset.collect() /* assign each tag an array of 3 floats */ - //tag2Vec = encode(tagSet) + tag2Vec = encode(tagSet) /* apply UDF to fix the length of each document */ val processed = dataset.toDF. - withColumn("features", applyWindowUdf(embeddings.get)($"text", $"target")).cache() - //.select($"features", $"label") + withColumn("features", applyWindowUdf(embeddings.get, tag2Vec)($"text", $"pos", $"start", $"end", $"target")).cache() + /* TODO: pick the parameters you want to expose*/ val lr = new LogisticRegression() - .setMaxIter(20) - .setRegParam(0.002) + .setMaxIter(26) + .setRegParam(0.001) .setElasticNetParam(0.8) @@ -73,7 +73,11 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac val values = Array(.25, .50, .75, 1) val codes = for (a <- values; b <- values; - c <- values) yield Array(a, b, c) + c <- values) yield { + import math.sqrt + val norm = sqrt(a * a + b * b + c * c) + Array(a/norm, b/norm, c/norm) + } tagSet.sorted.zip(codes).toMap } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 1506607ce1f140..64691e63705bcf 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -15,7 +15,7 @@ class AssertionLogRegModel(model:LogisticRegressionModel, tag2Vec: Map[String, A extends AnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings with Windowing { // TODO this should come as a parameter - override val (before, after) = (10, 14) + override val (before, after) = (10, 16) /** * takes a document and annotations and produces new annotations of this annotator's annotation type @@ -34,7 +34,7 @@ class AssertionLogRegModel(model:LogisticRegressionModel, tag2Vec: Map[String, A /* apply UDF to fix the length of each document */ val processed = dataset.toDF. 
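      /* Sketch of the POS features this change wires in (tag names hypothetical):
         the UDF now also receives the pos annotations and the target's start/end token
         indices, and each of the 3 tags preceding the target is looked up in the
         tag2Vec codes built by encode() above, e.g.

           codes("NN") ++ codes("IN") ++ codes("DT")   // 3 tags * 3 dims = 9 extra doubles

         which is why the length checks in the next hunk compare against 9. */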
- withColumn("features", applyWindowUdf(embeddings.get)($"text", $"target")).cache() //, $"pos", $"start", $"end" + withColumn("features", applyWindowUdf(embeddings.get, tag2Vec)($"text", $"pos", $"start", $"end", $"target")).cache() //, $"pos", $"start", $"end" //.select($"features", $"label") super.transform(model.transform(processed)) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 932c7e1ba47ded..037fa46c6d5422 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -66,10 +66,17 @@ trait Windowing extends Serializable{ val previous = if (start < 3) empty ++ empty ++ empty else pos.toArray.slice(start - 3, start).map(_.getString(3)).toArray.flatMap(tag => codes.get(tag).getOrElse(empty)) - val result = if (previous.length == 9) tmp ++ previous else tmp ++ empty ++ empty ++ empty + var result = if (previous.length == 9) tmp ++ previous else tmp ++ empty ++ empty ++ empty + + val following = if (end + 3 > pos.size) empty ++ empty ++ empty + else pos.toArray.slice(end, end + 3).map(_.getString(3)).toArray.flatMap(tag => codes.get(tag).getOrElse(empty)) + + result = if (following.length == 9) result ++ following else result ++ empty ++ empty ++ empty //if(result.length != 4009) //println(tmp.length, previous.length) Vectors.dense(result) + + } /* same as above, but convert the resulting text in a vector */ diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala index e39bd40543b5a8..8b4c16c72a79dd 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala @@ -5,6 +5,8 @@ import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.sql.{DataFrame, SparkSession} + + object I2b2DatasetLogRegTest extends App with Windowing { override val before = 10 @@ -64,5 +66,22 @@ object I2b2DatasetLogRegTest extends App with Windowing { (prec, rec, f1) } + def confusionMatrix[T](predicted: Seq[T], gold: Seq[T]) = { + val labels = gold.distinct + import scala.collection.mutable.{Map => MutableMap} + val matrix : Map[T, MutableMap[T, Int]] = + labels.map(label => (label, MutableMap(labels.zip(Array.fill(labels.size)(0)): _*))).toMap + + predicted.zip(gold).foreach { case (p, g) => + matrix.get(p).get(g) += 1 + } + + /* sanity check */ + if(predicted.length ==matrix.map(map => map._2.values.sum).sum) + println("looks good") + + matrix + } + } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index 197709d4105ccf..ad6c82d81c223d 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.ml.logreg -import com.johnsnowlabs.ml.logreg.I2b2DatasetLogRegTest.calcStat +import com.johnsnowlabs.ml.logreg.I2b2DatasetLogRegTest.{calcStat, confusionMatrix} import com.johnsnowlabs.nlp.DocumentAssembler import com.johnsnowlabs.nlp.annotators.RegexTokenizer import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach 
@@ -73,8 +73,6 @@ object I2b2DatasetPipelineTest extends App { model.transform(dataset) } - - val model = trainAssertionModel(trainPaths) val result = testAssertionModel(testPaths, model) @@ -88,5 +86,16 @@ object I2b2DatasetPipelineTest extends App { println(calcStat(tpFnFp.tp + tpFnFp.fn, tpFnFp.tp + tpFnFp.fp, tpFnFp.tp)) + val pred = result.select($"prediction").collect.map{ r => + r.getAs[Double]("prediction") + } + + val gold = result.select($"label").collect.map{ r => + r.getAs[Double]("label") + } + + + println(confusionMatrix(pred, gold)) + } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala index 49dbed578309c7..1d5f8084986a73 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -44,7 +44,7 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Wi } yield { val record = I2b2Annotation(annotation) val text = sourceTxt(record.sourceLine - 1) - if(record.target.split(" ").length > 8){ + if(record.target.split(" ").length > 10){ tooLong += 1 null } From 29319d2252eccff695ee192da74a1e371863c104 Mon Sep 17 00:00:00 2001 From: Alberto Date: Tue, 12 Dec 2017 16:48:44 -0300 Subject: [PATCH 12/55] modified tokenizer to match the one using in embeddings --- .../assertion/logreg/Windowing.scala | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 037fa46c6d5422..75c4532ed9a23f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -74,7 +74,7 @@ trait Windowing extends Serializable{ result = if (following.length == 9) result ++ following else result ++ empty ++ empty ++ empty //if(result.length != 4009) //println(tmp.length, previous.length) - Vectors.dense(result) + Vectors.dense(tmp) } @@ -97,10 +97,26 @@ trait Windowing extends Serializable{ Vectors.dense(vector.toArray ++ empty) } + /* these match the behavior we had when tokenizing sentences for word embeddings */ val punctuation = Seq(".", ":", ";", ",", "?", "!", "+", "-", "_", "(", ")", "{", - "}", "#", "/", "\\", "\"", "\'", "[", "]", "%", "<", ">", "&", "=") + "}", "#", "mg/kg", "ml", "m2", "cm", "/", "\\", "\"", "'", "[", "]", "%", "<", ">", "&", "=") + + val percent_regex = """([0-9]{1,2}\.[0-9]{1,2}%|[0-9]{1,3}%)""" + val number_regex = """([0-9]{1,6})""" + /* Tokenize a sentence taking care of punctuation */ - def tokenize(sentence: String) : Array[String] = sentence.split(" ") + def tokenize(sent: String) : Array[String] = { + var tmp = sent + + // replace percentage + tmp = tmp.replaceAll(percent_regex, " percentnum ") + + // replace special characters + punctuation.foreach(c => tmp = tmp.replace(c, " " + c + " ")) + + // replace any number + tmp.replaceAll(number_regex, " digitnum ").split(" ").filter(_ != "") + } } From 01973f9f6b979b11baef13a1c332c538c9e8192f Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 15 Dec 2017 14:24:09 -0300 Subject: [PATCH 13/55] work in progress --- .../logreg/AssertionLogRegApproach.scala | 21 +- .../assertion/logreg/Windowing.scala | 204 ++++++++++++++---- .../nlp/embeddings/WordEmbeddings.scala | 2 + .../ml/logreg/I2b2DatasetLogRegTest.scala | 115 ++++++++-- 
.../johnsnowlabs/ml/logreg/I2b2Reader.scala | 11 +- 5 files changed, 277 insertions(+), 76 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index a0a910833ceabf..8c7fcb759fc33b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -3,13 +3,10 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.embeddings.AnnotatorWithWordEmbeddings import com.johnsnowlabs.nlp.AnnotatorApproach -import com.johnsnowlabs.nlp.annotators.common.NerTagged.{getAnnotations, getLabels} -import com.johnsnowlabs.nlp.annotators.common.PosTagged -import org.apache.spark.sql.functions.udf -import org.apache.spark.ml.param.Param import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.param.Param /** * Created by jose on 22/11/17. @@ -24,13 +21,13 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac override lazy val localPath = "/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin.db" // example of possible values, 'Negated', 'Affirmed', 'Historical' - //val labelColumn = new Param[String](this, "label", "Column with one label per document") + val labelColumn = new Param[String](this, "label", "Column with one label per document") // the document where we're extracting the assertion - //val documentColumn = new Param[String](this, "documentColumn", "Column with one label per document") + val documentColumn = new Param[String](this, "documentColumn", "Column with one label per document") // the target term, that must appear capitalized in the document, e.g., 'diabetes' - //val targetColumn = new Param[String](this, "targetColumn", "Column with the target to analyze") + val targetColumn = new Param[String](this, "targetColumn", "Column with the target to analyze") override val (before, after) = (10, 16) var tag2Vec : Map[String, Array[Double]] = Map() @@ -38,12 +35,10 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac override def train(dataset: Dataset[_]): AssertionLogRegModel = { import dataset.sqlContext.implicits._ - /* read the set of all tags */ - val tagSet = inferTagSet(dataset.toDF) - //dataset.collect() - - /* assign each tag an array of 3 floats */ - tag2Vec = encode(tagSet) + // read the set of all tags + // val tagSet = inferTagSet(dataset.toDF) + // assign each tag an array of 3 floats + // tag2Vec = encode(tagSet) /* apply UDF to fix the length of each document */ val processed = dataset.toDF. 
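At this point train() hands the windowed feature column directly to Spark ML. A minimal,
self-contained sketch of that contract (column names follow the UDF above; the vectors,
the labels and a SparkSession named spark are assumed):

    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.linalg.Vectors

    val df = spark.createDataFrame(Seq(
      (Vectors.dense(0.2, -0.1, 0.7), 1.0),  // flattened window embeddings
      (Vectors.dense(0.0, 0.4, -0.3), 2.0)   // label: Double-encoded assertion class
    )).toDF("features", "label")

    val lrModel = new LogisticRegression().setMaxIter(26).fit(df)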
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 75c4532ed9a23f..6bcd18424e001e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -1,12 +1,11 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg -import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings} +import com.johnsnowlabs.nlp.embeddings.WordEmbeddings import org.apache.spark.sql.functions._ -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema -import scala.collection.mutable -import scala.util.Try +import scala.collection.{mutable} /** * Created by jose on 24/11/17. @@ -18,13 +17,20 @@ trait Windowing extends Serializable{ lazy val wordVectors: Option[WordEmbeddings] = None + /* TODO: create a tokenizer class */ + /* these match the behavior we had when tokenizing sentences for word embeddings */ + val punctuation = Seq(".", ":", ";", ",", "?", "!", "+", "-", "_", "(", ")", "{", + "}", "#", "mg/kg", "ml", "m2", "cm", "/", "\\", "\"", "'", "[", "]", "%", "<", ">", "&", "=") + + val percent_regex = """([0-9]{1,2}\.[0-9]{1,2}%|[0-9]{1,3}%)""" + val number_regex = """([0-9]{1,6})""" + + /* apply window, pad/truncate sentence according to window */ - def applyWindow(doc: String, s: Int, e: Int) : Array[String] = { + def applyWindow(doc: String, s: Int, e: Int): (Array[String], Array[String], Array[String]) = { val target = doc.slice(s, e) val targetPart = tokenize(target.trim) - //println(target) - val leftPart = if (s == 0) Array[String]() else tokenize(doc.slice(0, s).trim) //TODO add proper tokenizer here @@ -46,11 +52,11 @@ trait Windowing extends Serializable{ val leftContext = leftPart.slice(start, leftPart.length) val rightContext = rightPart.slice(0, end + 1) - leftPadding ++ leftContext ++ targetPart ++ rightContext ++ rightPadding + (leftPadding ++ leftContext, targetPart, rightContext ++ rightPadding) } /* apply window, pad/truncate sentence according to window */ - def applyWindow(doc: String, target: String) : Array[String] = { + def applyWindow(doc: String, target: String) : (Array[String], Array[String], Array[String])= { val start = doc.indexOf(target) val end = start + target.length applyWindow(doc, start, end) @@ -58,51 +64,157 @@ trait Windowing extends Serializable{ /* same as above, but convert the resulting text in a vector */ def applyWindowUdf(wvectors: WordEmbeddings, codes: Map[String, Array[Double]]) = - udf {(doc:String, pos:mutable.WrappedArray[GenericRowWithSchema], start:Int, end:Int, target:String) => - val tmp = applyWindow(doc.toLowerCase, target.toLowerCase). 
- flatMap(wvectors.getEmbeddings).map(_.toDouble) + udf {(doc:String, pos:mutable.WrappedArray[GenericRowWithSchema], start:Int, end:Int, targetTerm:String) => - val empty = Array.fill(3)(0.0) - val previous = if (start < 3) empty ++ empty ++ empty - else pos.toArray.slice(start - 3, start).map(_.getString(3)).toArray.flatMap(tag => codes.get(tag).getOrElse(empty)) + val (l, t, r) = applyWindow(doc.toLowerCase, targetTerm.toLowerCase) + val target = Array.fill(5)(0.2f) + val nonTarget = Array.fill(5)(0.0f) + val tmp = l.flatMap(w => wvectors.getEmbeddings(w) ++ nonTarget).map(_.toDouble) ++ + t.flatMap(w => wvectors.getEmbeddings(w) ++ target).map(_.toDouble) ++ + r.flatMap(w => wvectors.getEmbeddings(w) ++ nonTarget).map(_.toDouble) - var result = if (previous.length == 9) tmp ++ previous else tmp ++ empty ++ empty ++ empty - - val following = if (end + 3 > pos.size) empty ++ empty ++ empty - else pos.toArray.slice(end, end + 3).map(_.getString(3)).toArray.flatMap(tag => codes.get(tag).getOrElse(empty)) - - result = if (following.length == 9) result ++ following else result ++ empty ++ empty ++ empty - //if(result.length != 4009) - //println(tmp.length, previous.length) Vectors.dense(tmp) - } /* same as above, but convert the resulting text in a vector */ def applyWindowUdf(wvectors: WordEmbeddings) = - udf {(doc:String, target:String) => - val tmp = applyWindow(doc.toLowerCase, target.toLowerCase). - flatMap(wvectors.getEmbeddings).map(_.toDouble) - Vectors.dense(tmp) + //here s and e are token number for start and end of target when split on " " + udf {(doc:String, targetTerm:String, s:Int, e:Int) => + val tokens = doc.split(" ").filter(_!="") + + /* now start and end are indexes in the doc string */ + val start = tokens.slice(0, s).map(_.length).sum + + tokens.slice(0, s).size // account for spaces + val end = start + tokens.slice(s, e + 1).map(_.length).sum + + tokens.slice(s, e + 1).size // account for spaces + + val (l, t, r) = applyWindow(doc.toLowerCase, start, end) + + var target = Array(0.1, -0.1) + var nonTarget = Array(-0.1, 0.1) + + val vector : Array[Double] = l.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble)) ++ + t.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ++ normalize(target)) ++ + r.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ++ normalize(nonTarget)) + + //++ computeLeftDistances(l.takeRight(2), wvectors) ++ computeRightDistances(r.take(2), wvectors) + if(l.isEmpty || t.isEmpty || r.isEmpty) + println(vector.sum) + + Vectors.dense(vector) } - /* appends POS tags at the end of the vector */ - def appendPos(codes: Map[String, Array[Double]]) = - udf {(vector:Vector, pos:mutable.WrappedArray[GenericRowWithSchema], start:Int, end:Int, target:String) => - val empty = Array.fill(9)(0.0) - val previous = if (start < 3) empty - else pos.toArray.slice(start - 3, start).map(_.getString(3)).toArray.flatMap(tag => codes.get(tag).getOrElse(empty)) + val dictWords = Seq("suggest", "evidence", "investigate", "likely", "possibly", "unclear") + //"possible", "deny", "judge", "father", "history", "appear", "no") + //"non", "imaging", "consistent", "thought", "prevent", "element") + //"believe", "rule", "discard", "vs", "either", "may") + //,"associated", "causes", "leading", "before") + + var dictEmbeddings : Seq[Array[Float]] = Seq() + + import math._ + + def distance(xs: Array[Float], ys: Array[Float]):Double = { + sqrt((xs zip ys).map { case (x,y) => pow(y - x, 2) }.sum) + } + + def l2norm(xs: Array[Double]):Double = { + sqrt(xs.map{ x => pow(x, 2)}.sum) 
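      // e.g. l2norm(Array(3.0, 4.0)) == 5.0; normalize below divides by l2norm(vec) + 1.0,
      // so normalize(Array(3.0, 4.0)) == Array(0.5, 0.666...), the + 1.0 presumably keeping
      // all-zero (out-of-vocabulary) embeddings from causing a division by zero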
+ } + + def normalize(vec: Array[Double]) : Array[Double] = { + val norm = l2norm(vec) + 1.0 + vec.map(element => element / norm) + } + + + /* original */ + def computeDictDistances(word: String, wvectors: WordEmbeddings) :Array[Double] = { + val embeddings = dictEmbeddings + val distances = embeddings.map(e => distance(e, wvectors.getEmbeddings(word))).toArray + val norm = l2norm(distances) + distances.map(d => d / norm) + } + + /* +conditional + val preComplex = Map("associated" -> List("with"), + "leading" -> List("to"), + "history" -> List("of")) - Vectors.dense(vector.toArray ++ empty) + val preSimple = Array("causes", "caused") + + val postComplex = Map("happens" -> List("when"), + "with" -> List("allergies", "stress", "emotional", "climbing"), + "on" -> List("climbing"), + "at" -> List("work", "home", "rest")) + */ + + + + val preComplex = Map("suspicious" -> List("for"), + "could" -> List("be"), "suggestive" -> List("of", "that"), + "imaging" -> List("for"), "question" -> List("of"), + "rule" -> List("out"), "evaluation" -> List("for"), + "attributable" -> List("to"), "consistent" -> List("with"), + "possibility" -> List("to", "of"), "in" -> List("case"), "for" -> List("presumed"), "with" ->List("possible")) + + val preSimple = Array("suggesting", "suggest", "possible", "presumed", "perhaps", "question", "investigate") + + val posSimple = Array("vs", "or", "occurred") + + val posComplex = Map("was" -> List("considered")) + + def computeLeftDistances(context: Array[String], wvectors: WordEmbeddings): + Array[Double] = { + //add distances to single word cues + val single : Array[Double] = preSimple map { cue => + if(context.size > 0) + distance(wvectors.getEmbeddings(cue), wvectors.getEmbeddings(context.takeRight(1).head)) + else + -0.5 // we don't know } - /* these match the behavior we had when tokenizing sentences for word embeddings */ - val punctuation = Seq(".", ":", ";", ",", "?", "!", "+", "-", "_", "(", ")", "{", - "}", "#", "mg/kg", "ml", "m2", "cm", "/", "\\", "\"", "'", "[", "]", "%", "<", ">", "&", "=") + //add distances to complex word cues + val complex = for ((firstToken, secondTokens) <- preComplex; + secondToken <- secondTokens) yield { + if(context.size > 1) + distance(wvectors.getEmbeddings(firstToken), wvectors.getEmbeddings(context.takeRight(2).head)) + + distance(wvectors.getEmbeddings(secondToken), wvectors.getEmbeddings(context.takeRight(1).head)) + else + -1 // we don't know + } + val distances:Array[Double] = Array(single.min, complex.min) + normalize(distances.toArray) + } - val percent_regex = """([0-9]{1,2}\.[0-9]{1,2}%|[0-9]{1,3}%)""" - val number_regex = """([0-9]{1,6})""" + + def computeRightDistances(context: Array[String], wvectors: WordEmbeddings): + Array[Double] = { + + + //add distances to complex word cues + val complex = for ((firstToken, secondTokens) <- posComplex; + secondToken <- secondTokens) yield { + if(context.size > 1) + distance(wvectors.getEmbeddings(firstToken), wvectors.getEmbeddings(context.head)) + + distance(wvectors.getEmbeddings(secondToken), wvectors.getEmbeddings(context.tail.head)) + else + -1.0 // we don't know + } + + //add distances to single word cues + val single : Array[Double] = preSimple map { cue => + if(context.size > 0) + distance(wvectors.getEmbeddings(cue), wvectors.getEmbeddings(context.head)) + else + -0.5 // we don't know + } + + val distances = Array(single.min, complex.min) + normalize(distances) + } /* Tokenize a sentence taking care of punctuation */ @@ -110,13 +222,19 @@ trait Windowing extends 
Serializable{
   var tmp = sent
 
   // replace percentage
-  tmp = tmp.replaceAll(percent_regex, " percentnum ")
+  //tmp = tmp.replaceAll(percent_regex, " percentnum ")
 
   // replace special characters
-  punctuation.foreach(c => tmp = tmp.replace(c, " " + c + " "))
+  punctuation.foreach(c => tmp = tmp.replace(c, " " + c + " "))
+
+  tmp = tmp.replace(",", " ")
+  tmp = tmp.replace("\u201c", "\"")
+  tmp = tmp.replace("'", " ' ")
+
 
 
   // replace any number
-  tmp.replaceAll(number_regex, " digitnum ").split(" ").filter(_ != "")
+  // tmp.replaceAll(number_regex, " digitnum ").split(" ").filter(_ != "")
+  tmp.split(" ").map(_.trim).filter(_ != "")
 }
 
 }
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala
index 4a869b3c659ddc..bdd6aa9aae9cd0 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala
@@ -28,6 +28,8 @@ case class WordEmbeddings(dbFile: String,
   }
 
   def getEmbeddings(word: String): Array[Float] = {
+    if(word.contains(" "))
+      println(s"ERROR: embeddings requested for multi-token string '$word'")
     lru.getOrElseUpdate(word, getEmbeddingsFromDb(word))
   }
 
diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala
index 8b4c16c72a79dd..692559ccbed45f 100644
--- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala
@@ -1,45 +1,77 @@
-
- withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target")) + val trainDataset = reader.readDataFrame(trainDatasetPath) + .withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target", $"start", $"end")) .select($"features", $"label") println("trainDsSize: " + trainDataset.count) val testDataset = reader.readDataFrame(testDatasetPath). - withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target")) - .select($"features", $"label") + withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target", $"start", $"end")) + .select($"features", $"label", $"text", $"target") println("testDsSize: " + testDataset.count) val model = train(trainDataset) - val result = model.transform(testDataset) - - import spark.implicits._ case class TpFnFp(tp: Int, fn: Int, fp: Int) + import org.apache.spark.mllib.util.MLUtils + + val test = testDataset + .rdd.map(r => LabeledPoint(r.getAs[Double]("label"), + r.getAs[MlLibVector]("features"))) + + // Compute raw scores on the test set. + val predictionAndLabels = test.map { case LabeledPoint(label, features) => + val prediction = model.predict(features) + (prediction, label) + } + + + val tpTnFp = predictionAndLabels.map ({ case (pred, label) => + if (pred == label) TpFnFp(1, 0, 0) + else TpFnFp(0, 1, 1) + }).collect().reduce((t1, t2) => TpFnFp(t1.tp + t2.tp, t1.fn + t2.fn, t1.fp + t2.fp)) + + println(calcStat(tpTnFp.tp + tpTnFp.fn, tpTnFp.tp + tpTnFp.fp, tpTnFp.tp)) + + + /* + val result = model.transform(testDataset.cache()) + val tpTnFp = result.map ({ r => if (r.getAs[Double]("prediction") == r.getAs[Double]("label")) TpFnFp(1, 0, 0) else TpFnFp(0, 1, 1) @@ -47,13 +79,62 @@ object I2b2DatasetLogRegTest extends App with Windowing { println(calcStat(tpTnFp.tp + tpTnFp.fn, tpTnFp.tp + tpTnFp.fp, tpTnFp.tp)) + val evaluator = new MulticlassClassificationEvaluator("f1").setMetricName("f1") + println("Test set f1 = " + evaluator.evaluate(result)) + + val badGuys = result.filter(r => r.getAs[Double]("prediction") != r.getAs[Double]("label")).collect() + println(badGuys) + + + val pred = result.select($"prediction").collect.map{ r => + r.getAs[Double]("prediction") + } + + val gold = result.select($"label").collect.map{ r => + r.getAs[Double]("label") + } + + + println(confusionMatrix(pred, gold)) */ + def train(dataFrame: DataFrame) = { +/* import spark.implicits._ val lr = new LogisticRegression() - .setMaxIter(20) - .setRegParam(0.002) - .setElasticNetParam(0.8) - lr.fit(dataFrame) + .setMaxIter(20) //20 + .setRegParam(0.00135) //0.0012 + .setElasticNetParam(0.8) //0.8 + .setTol(1.0) + .setStandardization(false) + + lr.fit(dataFrame) */ + + // Run training algorithm to build the model + val model = new LogisticRegressionWithLBFGS() + model.optimizer.setRegParam(0.0013) + model.setNumClasses(6) + + model.run(dataFrame.rdd + .map(r => LabeledPoint(r.getAs[Double]("label"), + r.getAs[MlLibVector]("features")))) + +/* + val layers = Array[Int](5630, 6) + + // 5078, 6 -> 0.8878302 + + // create the trainer and set its parameters + val trainer = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100) // 30 + .setTol(1E-6) //1E-5 + + trainer.fit(dataFrame.cache()) +*/ + + } /* TODO put in a common place */ diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala index 1d5f8084986a73..c5f976e3c05cfe 100644 --- 
a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -44,7 +44,7 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Wi } yield { val record = I2b2Annotation(annotation) val text = sourceTxt(record.sourceLine - 1) - if(record.target.split(" ").length > 10){ + if(record.target.split(" ").length > 8){ tooLong += 1 null } @@ -65,9 +65,14 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Wi } + /* private val mappings = Map("hypothetical" -> 0.0, "present" -> 1.0, "absent" -> 2.0, "possible" -> 3.0, - "conditional"-> 4.0, "associated_with_someone_else" -> 5.0) + "conditional"-> 4.0, "associated_with_someone_else" -> 5.0)*/ + + private val mappings = Map("hypothetical" -> 0, + "present" -> 1, "absent" -> 2, "possible" -> 3, + "conditional"-> 4, "associated_with_someone_else" -> 5) /* TODO duplicated logic, consider relocation to common place */ override lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { @@ -94,7 +99,7 @@ object I2b2Annotation { } private def extractSourceLine(text: String): Int = { - val pattern = "(\\d+):\\d+".r + val pattern = "c=\".*\" (\\d+):\\d+".r pattern.findFirstMatchIn(text).map(_.group(1)). getOrElse(throw new RuntimeException("Broken dataset - bad source line")).toInt } From d7c29d80dd07ba1bf09c0b7abe85c1a0ad4c1ba4 Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 15 Dec 2017 18:39:00 -0300 Subject: [PATCH 14/55] work in progress --- .../logreg/AssertionLogRegApproach.scala | 12 +- .../logreg/AssertionLogRegModel.scala | 6 +- .../assertion/logreg/Windowing.scala | 143 +++--------------- .../ml/logreg/I2b2DatasetPipelineTest.scala | 5 +- .../johnsnowlabs/ml/logreg/I2b2Reader.scala | 7 +- 5 files changed, 31 insertions(+), 142 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 8c7fcb759fc33b..6c0d28c358fba0 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -14,7 +14,7 @@ import org.apache.spark.ml.param.Param class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproach[AssertionLogRegModel] with AnnotatorWithWordEmbeddings with Windowing { - override val requiredAnnotatorTypes = Array(DOCUMENT, POS) + override val requiredAnnotatorTypes = Array(DOCUMENT) //, POS override val description: String = "Clinical Text Status Assertion" override val annotatorType: AnnotatorType = ASSERTION def this() = this(Identifiable.randomUID("ASSERTION")) @@ -29,7 +29,7 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac // the target term, that must appear capitalized in the document, e.g., 'diabetes' val targetColumn = new Param[String](this, "targetColumn", "Column with the target to analyze") - override val (before, after) = (10, 16) + override val (before, after) = (11, 13) var tag2Vec : Map[String, Array[Double]] = Map() override def train(dataset: Dataset[_]): AssertionLogRegModel = { @@ -42,15 +42,15 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac /* apply UDF to fix the length of each document */ val processed = dataset.toDF. 
- withColumn("features", applyWindowUdf(embeddings.get, tag2Vec)($"text", $"pos", $"start", $"end", $"target")).cache() + //withColumn("features", applyWindowUdf(embeddings.get, tag2Vec)($"text", $"pos", $"start", $"end", $"target")).cache() + withColumn("features", applyWindowUdf(embeddings.get)($"text", $"target", $"start", $"end")).cache() /* TODO: pick the parameters you want to expose*/ val lr = new LogisticRegression() .setMaxIter(26) - .setRegParam(0.001) - .setElasticNetParam(0.8) - + .setRegParam(0.00184) + .setElasticNetParam(0.9) fillModelEmbeddings(AssertionLogRegModel(lr.fit(processed), tag2Vec)) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 64691e63705bcf..d8c994c2319282 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -15,7 +15,7 @@ class AssertionLogRegModel(model:LogisticRegressionModel, tag2Vec: Map[String, A extends AnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings with Windowing { // TODO this should come as a parameter - override val (before, after) = (10, 16) + override val (before, after) = (11, 13) /** * takes a document and annotations and produces new annotations of this annotator's annotation type @@ -28,13 +28,13 @@ class AssertionLogRegModel(model:LogisticRegressionModel, tag2Vec: Map[String, A override val annotatorType: AnnotatorType = AnnotatorType.ASSERTION - override val requiredAnnotatorTypes = Array(DOCUMENT, POS) + override val requiredAnnotatorTypes = Array(DOCUMENT) //, POS override final def transform(dataset: Dataset[_]): DataFrame = { import dataset.sqlContext.implicits._ /* apply UDF to fix the length of each document */ val processed = dataset.toDF. - withColumn("features", applyWindowUdf(embeddings.get, tag2Vec)($"text", $"pos", $"start", $"end", $"target")).cache() //, $"pos", $"start", $"end" + withColumn("features", applyWindowUdf(embeddings.get)($"text", $"target", $"start", $"end")).cache() //, $"text", $"pos", $"start", $"end", $"target" //.select($"features", $"label") super.transform(model.transform(processed)) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 6bcd18424e001e..915f1ba5a947a0 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -4,8 +4,7 @@ import com.johnsnowlabs.nlp.embeddings.WordEmbeddings import org.apache.spark.sql.functions._ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema - -import scala.collection.{mutable} +import scala.collection.mutable /** * Created by jose on 24/11/17. 
@@ -67,16 +66,23 @@ trait Windowing extends Serializable{ udf {(doc:String, pos:mutable.WrappedArray[GenericRowWithSchema], start:Int, end:Int, targetTerm:String) => val (l, t, r) = applyWindow(doc.toLowerCase, targetTerm.toLowerCase) - val target = Array.fill(5)(0.2f) - val nonTarget = Array.fill(5)(0.0f) - val tmp = l.flatMap(w => wvectors.getEmbeddings(w) ++ nonTarget).map(_.toDouble) ++ - t.flatMap(w => wvectors.getEmbeddings(w) ++ target).map(_.toDouble) ++ - r.flatMap(w => wvectors.getEmbeddings(w) ++ nonTarget).map(_.toDouble) + var target = Array(0.1, -0.1) + var nonTarget = Array(-0.1, 0.1) + val tmp = l.flatMap(w => wvectors.getEmbeddings(w)).map(_.toDouble) ++ + t.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ).map(_.toDouble) ++ + r.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ).map(_.toDouble) Vectors.dense(tmp) } + import scala.util.Random + val random = new Random() + + var tt = Array(random.nextGaussian(), random.nextGaussian()) + var nt = Array(-random.nextGaussian(), -random.nextGaussian()) + + /* same as above, but convert the resulting text in a vector */ def applyWindowUdf(wvectors: WordEmbeddings) = //here s and e are token number for start and end of target when split on " " @@ -90,36 +96,19 @@ trait Windowing extends Serializable{ tokens.slice(s, e + 1).size // account for spaces val (l, t, r) = applyWindow(doc.toLowerCase, start, end) + val vector : Array[Double] = l.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble)) ++ + t.flatMap(w => normalize(wvectors.getEmbeddings(w).map(_.toDouble))) ++ + r.flatMap(w => normalize(wvectors.getEmbeddings(w).map(_.toDouble))) - var target = Array(0.1, -0.1) - var nonTarget = Array(-0.1, 0.1) - val vector : Array[Double] = l.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble)) ++ - t.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ++ normalize(target)) ++ - r.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ++ normalize(nonTarget)) - //++ computeLeftDistances(l.takeRight(2), wvectors) ++ computeRightDistances(r.take(2), wvectors) - if(l.isEmpty || t.isEmpty || r.isEmpty) - println(vector.sum) Vectors.dense(vector) } - val dictWords = Seq("suggest", "evidence", "investigate", "likely", "possibly", "unclear") - //"possible", "deny", "judge", "father", "history", "appear", "no") - //"non", "imaging", "consistent", "thought", "prevent", "element") - //"believe", "rule", "discard", "vs", "either", "may") - //,"associated", "causes", "leading", "before") - - var dictEmbeddings : Seq[Array[Float]] = Seq() - - import math._ - - def distance(xs: Array[Float], ys: Array[Float]):Double = { - sqrt((xs zip ys).map { case (x,y) => pow(y - x, 2) }.sum) - } def l2norm(xs: Array[Double]):Double = { + import math._ sqrt(xs.map{ x => pow(x, 2)}.sum) } @@ -128,113 +117,15 @@ trait Windowing extends Serializable{ vec.map(element => element / norm) } - - /* original */ - def computeDictDistances(word: String, wvectors: WordEmbeddings) :Array[Double] = { - val embeddings = dictEmbeddings - val distances = embeddings.map(e => distance(e, wvectors.getEmbeddings(word))).toArray - val norm = l2norm(distances) - distances.map(d => d / norm) - } - - /* -conditional - val preComplex = Map("associated" -> List("with"), - "leading" -> List("to"), - "history" -> List("of")) - - val preSimple = Array("causes", "caused") - - val postComplex = Map("happens" -> List("when"), - "with" -> List("allergies", "stress", "emotional", "climbing"), - "on" -> List("climbing"), - "at" -> List("work", "home", "rest")) - */ - - - - val 
preComplex = Map("suspicious" -> List("for"), - "could" -> List("be"), "suggestive" -> List("of", "that"), - "imaging" -> List("for"), "question" -> List("of"), - "rule" -> List("out"), "evaluation" -> List("for"), - "attributable" -> List("to"), "consistent" -> List("with"), - "possibility" -> List("to", "of"), "in" -> List("case"), "for" -> List("presumed"), "with" ->List("possible")) - - val preSimple = Array("suggesting", "suggest", "possible", "presumed", "perhaps", "question", "investigate") - - val posSimple = Array("vs", "or", "occurred") - - val posComplex = Map("was" -> List("considered")) - - def computeLeftDistances(context: Array[String], wvectors: WordEmbeddings): - Array[Double] = { - //add distances to single word cues - val single : Array[Double] = preSimple map { cue => - if(context.size > 0) - distance(wvectors.getEmbeddings(cue), wvectors.getEmbeddings(context.takeRight(1).head)) - else - -0.5 // we don't know - } - - //add distances to complex word cues - val complex = for ((firstToken, secondTokens) <- preComplex; - secondToken <- secondTokens) yield { - if(context.size > 1) - distance(wvectors.getEmbeddings(firstToken), wvectors.getEmbeddings(context.takeRight(2).head)) + - distance(wvectors.getEmbeddings(secondToken), wvectors.getEmbeddings(context.takeRight(1).head)) - else - -1 // we don't know - } - val distances:Array[Double] = Array(single.min, complex.min) - normalize(distances.toArray) - } - - - def computeRightDistances(context: Array[String], wvectors: WordEmbeddings): - Array[Double] = { - - - //add distances to complex word cues - val complex = for ((firstToken, secondTokens) <- posComplex; - secondToken <- secondTokens) yield { - if(context.size > 1) - distance(wvectors.getEmbeddings(firstToken), wvectors.getEmbeddings(context.head)) + - distance(wvectors.getEmbeddings(secondToken), wvectors.getEmbeddings(context.tail.head)) - else - -1.0 // we don't know - } - - //add distances to single word cues - val single : Array[Double] = preSimple map { cue => - if(context.size > 0) - distance(wvectors.getEmbeddings(cue), wvectors.getEmbeddings(context.head)) - else - -0.5 // we don't know - } - - val distances = Array(single.min, complex.min) - normalize(distances) - } - - /* Tokenize a sentence taking care of punctuation */ def tokenize(sent: String) : Array[String] = { var tmp = sent - // replace percentage - //tmp = tmp.replaceAll(percent_regex, " percentnum ") - // replace special characters punctuation.foreach(c => tmp = tmp.replace(c, " " + c + " ")) - tmp = tmp.replace(",", " ") tmp = tmp.replace(""", "\"") tmp = tmp.replace("'", " ' ") - - - // replace any number - // tmp.replaceAll(number_regex, " digitnum ").split(" ").filter(_ != "") tmp.split(" ").map(_.trim).filter(_ != "") } - } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index ad6c82d81c223d..b28057d8e3eda6 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -39,14 +39,15 @@ object I2b2DatasetPipelineTest extends App { .setOutputCol("pos") val assertionStatus = new AssertionLogRegApproach() - .setInputCols("document", "pos") + .setInputCols("document") //, "pos" + .setOutputCol("assertion") .setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary) .setEmbeddingsFolder("/home/jose/Downloads/bio_nlp_vec") Array(documentAssembler, tokenizer, - posTagger, + 
//posTagger, assertionStatus) } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala index c5f976e3c05cfe..5f125c2ea4da12 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -65,14 +65,11 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Wi } - /* + private val mappings = Map("hypothetical" -> 0.0, "present" -> 1.0, "absent" -> 2.0, "possible" -> 3.0, - "conditional"-> 4.0, "associated_with_someone_else" -> 5.0)*/ + "conditional"-> 4.0, "associated_with_someone_else" -> 5.0) - private val mappings = Map("hypothetical" -> 0, - "present" -> 1, "absent" -> 2, "possible" -> 3, - "conditional"-> 4, "associated_with_someone_else" -> 5) /* TODO duplicated logic, consider relocation to common place */ override lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { From 1e3b516eb4bd994527d14de4d9fe91b905935c3f Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 18 Dec 2017 12:36:34 -0300 Subject: [PATCH 15/55] deleted negex dataset reader --- .../ml/logreg/NegexDatasetLogRegTest.scala | 41 ------------------- 1 file changed, 41 deletions(-) delete mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala deleted file mode 100644 index e469ef75700758..00000000000000 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala +++ /dev/null @@ -1,41 +0,0 @@ -package com.johnsnowlabs.ml.logreg - -import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.sql.{DataFrame, SparkSession} - -/** - * Created by jose on 24/11/17. - */ -object NegexDatasetLogRegTest extends App { - - /* local Spark for test */ - implicit val spark = SparkSession.builder().appName("DataFrame-UDF").master("local[4]").getOrCreate() - val datasetPath = "rsAnnotations-1-120-random.txt.csv" - - val embeddingsDims = 200 - val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" - val reader = new NegexDatasetReader(embeddingsFile, embeddingsDims) - - def train(dataFrame: DataFrame) = { - import spark.implicits._ - val lr = new LogisticRegression() - .setMaxIter(8) - .setRegParam(0.01) - .setElasticNetParam(0.8) - lr.fit(dataFrame) - } - - // test on train data, just as a 'smoke test' - val ds = reader.readNegexDataset(datasetPath) - - // Split the data into training and test sets (30% held out for testing). 
- val Array(trainingData, testData) = ds.randomSplit(Array(0.7, 0.3)) - val model = train(trainingData) - - val result = model.transform(testData) - val total = result.count - val correct = result.filter(r => r.getAs[Double]("prediction") == r.getAs[Double]("label")).count - - println("Accuracy: " + correct.toDouble / total.toDouble) - println(s"Coefficients: ${model.coefficients} Intercept: ${model.intercept}") -} From e28b52ab8d6eae2700d9c74124855e068b060cef Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 18 Dec 2017 12:46:04 -0300 Subject: [PATCH 16/55] deleted negex dataset reader --- .../ml/logreg/NegexDatasetReader.scala | 44 ------------------- 1 file changed, 44 deletions(-) delete mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala deleted file mode 100644 index 89e24948655903..00000000000000 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala +++ /dev/null @@ -1,44 +0,0 @@ -package com.johnsnowlabs.ml.logreg - -import java.io.File - -import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing -import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer} -import org.apache.spark.sql._ -import org.apache.spark.sql.functions.udf - -/** - * Reader for this dataset, - * https://github.com/mongoose54/negex/blob/master/genConText/rsAnnotations-1-120-random.txt - */ -class NegexDatasetReader(wordEmbeddingsFile: String, wordEmbeddingsNDims: Int) extends Serializable with Windowing{ - - var fileDb = wordEmbeddingsFile + ".db" - private val mappings = Map("Affirmed" -> 0.0, "Negated" -> 1.0,"Historical" -> 2.0, "Family" -> 3.0) - override val (before, after) = (5, 8) - - /* TODO duplicated logic, consider relocation to common place */ - override lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { - wordEmbeddingsFile => - require(new File(wordEmbeddingsFile).exists()) - if (!new File(fileDb).exists()) - WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) - }.filter(_ => new File(fileDb).exists()) - .map(_ => WordEmbeddings(fileDb, wordEmbeddingsNDims)) - - def readNegexDataset(datasetPath: String)(implicit session:SparkSession) = { - import session.implicits._ - - val dataset = session.read.format("com.databricks.spark.csv"). - option("delimiter", "\t"). - option("header", "true"). 
- load(datasetPath) - - /* apply UDF to fix the length of each document */ - dataset.select(applyWindowUdf(null, null)($"sentence", $"target") - .as("features"), labelToNumber($"label").as("label")) - } - - def labelToNumber = udf { label:String => mappings.get(label)} - -} From f807856b1d9e6c766ce8c2c2bfb36576d5bbc4a0 Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 18 Dec 2017 15:10:56 -0300 Subject: [PATCH 17/55] little cleanup in test --- .../assertion/logreg/Windowing.scala | 59 +++++----- .../ml/logreg/I2b2DatasetLogRegTest.scala | 108 ++++-------------- 2 files changed, 49 insertions(+), 118 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 915f1ba5a947a0..76992937d35b97 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -1,20 +1,24 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.embeddings.WordEmbeddings -import org.apache.spark.sql.functions._ import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.sql.functions._ import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import scala.util.Random + import scala.collection.mutable /** * Created by jose on 24/11/17. */ -trait Windowing extends Serializable{ +trait Windowing extends Serializable { + /* */ val before : Int val after : Int lazy val wordVectors: Option[WordEmbeddings] = None + val random = new Random() /* TODO: create a tokenizer class */ /* these match the behavior we had when tokenizing sentences for word embeddings */ @@ -31,7 +35,7 @@ trait Windowing extends Serializable{ val target = doc.slice(s, e) val targetPart = tokenize(target.trim) val leftPart = if (s == 0) Array[String]() - else tokenize(doc.slice(0, s).trim) //TODO add proper tokenizer here + else tokenize(doc.slice(0, s).trim) val rightPart = if (e == doc.length) Array[String]() else tokenize(doc.slice(e, doc.length).trim) @@ -61,49 +65,40 @@ trait Windowing extends Serializable{ applyWindow(doc, start, end) } - /* same as above, but convert the resulting text in a vector */ + + def applyWindow(wvectors: WordEmbeddings) (doc:String, targetTerm:String, s:Int, e:Int) : Array[Double] = { + val tokens = doc.split(" ").filter(_!="") + + /* now start and end are indexes in the doc string */ + val start = tokens.slice(0, s).map(_.length).sum + + tokens.slice(0, s).size // account for spaces + val end = start + tokens.slice(s, e + 1).map(_.length).sum + + tokens.slice(s, e + 1).size // account for spaces + + val (l, t, r) = applyWindow(doc.toLowerCase, start, end) + + l.flatMap(w => normalize(wvectors.getEmbeddings(w).map(_.toDouble))) ++ + t.flatMap(w => normalize(wvectors.getEmbeddings(w).map(_.toDouble))) ++ + r.flatMap(w => normalize(wvectors.getEmbeddings(w).map(_.toDouble))) + } + def applyWindowUdf(wvectors: WordEmbeddings, codes: Map[String, Array[Double]]) = udf {(doc:String, pos:mutable.WrappedArray[GenericRowWithSchema], start:Int, end:Int, targetTerm:String) => - val (l, t, r) = applyWindow(doc.toLowerCase, targetTerm.toLowerCase) var target = Array(0.1, -0.1) var nonTarget = Array(-0.1, 0.1) - val tmp = l.flatMap(w => wvectors.getEmbeddings(w)).map(_.toDouble) ++ + l.flatMap(w => wvectors.getEmbeddings(w)).map(_.toDouble) ++ t.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ).map(_.toDouble) ++ r.flatMap(w => 
wvectors.getEmbeddings(w).map(_.toDouble) ).map(_.toDouble) - - Vectors.dense(tmp) - } - import scala.util.Random - val random = new Random() - var tt = Array(random.nextGaussian(), random.nextGaussian()) var nt = Array(-random.nextGaussian(), -random.nextGaussian()) - - /* same as above, but convert the resulting text in a vector */ def applyWindowUdf(wvectors: WordEmbeddings) = //here s and e are token number for start and end of target when split on " " - udf {(doc:String, targetTerm:String, s:Int, e:Int) => - val tokens = doc.split(" ").filter(_!="") - - /* now start and end are indexes in the doc string */ - val start = tokens.slice(0, s).map(_.length).sum + - tokens.slice(0, s).size // account for spaces - val end = start + tokens.slice(s, e + 1).map(_.length).sum + - tokens.slice(s, e + 1).size // account for spaces - - val (l, t, r) = applyWindow(doc.toLowerCase, start, end) - val vector : Array[Double] = l.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble)) ++ - t.flatMap(w => normalize(wvectors.getEmbeddings(w).map(_.toDouble))) ++ - r.flatMap(w => normalize(wvectors.getEmbeddings(w).map(_.toDouble))) - - - - - Vectors.dense(vector) + udf { (doc:String, targetTerm:String, s:Int, e:Int) => + Vectors.dense(applyWindow(wvectors)(doc, targetTerm, s, e)) } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala index 692559ccbed45f..a2c6fbed12ea25 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala @@ -2,24 +2,20 @@ package com.johnsnowlabs.ml.logreg import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing -import org.apache.spark.ml.classification.{GBTClassifier, LogisticRegression, MultilayerPerceptronClassifier} -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.{ColumnName, DataFrame, SparkSession} -//shitty spark! -import org.apache.spark.mllib.linalg.{Vectors => MlLibVectors} -import org.apache.spark.ml.linalg.{Vector => MlVector} -import org.apache.spark.mllib.linalg.{Vector => MlLibVector} object I2b2DatasetLogRegTest extends App with Windowing { - override val before = 12 - override val after = 12 + override val before = 11 + override val after = 13 implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[2]").getOrCreate() + import spark.implicits._ // directory of the i2b2 dataset val i2b2Dir = "/home/jose/Downloads/i2b2" @@ -31,45 +27,23 @@ object I2b2DatasetLogRegTest extends App with Windowing { val embeddingsDims = 200 val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" - //val embeddingsFile = s"/home/jose/embeddings/pubmed_i2b2.bin" val reader = new I2b2DatasetReader(embeddingsFile) - import spark.implicits._ val trainDataset = reader.readDataFrame(trainDatasetPath) .withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target", $"start", $"end")) .select($"features", $"label") println("trainDsSize: " + trainDataset.count) - val testDataset = reader.readDataFrame(testDatasetPath). 
- withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target", $"start", $"end")) + val testDataset = reader.readDataFrame(testDatasetPath) + .withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target", $"start", $"end")) .select($"features", $"label", $"text", $"target") println("testDsSize: " + testDataset.count) val model = train(trainDataset) case class TpFnFp(tp: Int, fn: Int, fp: Int) - import org.apache.spark.mllib.util.MLUtils - - val test = testDataset - .rdd.map(r => LabeledPoint(r.getAs[Double]("label"), - r.getAs[MlLibVector]("features"))) // Compute raw scores on the test set. - val predictionAndLabels = test.map { case LabeledPoint(label, features) => - val prediction = model.predict(features) - (prediction, label) - } - - - val tpTnFp = predictionAndLabels.map ({ case (pred, label) => - if (pred == label) TpFnFp(1, 0, 0) - else TpFnFp(0, 1, 1) - }).collect().reduce((t1, t2) => TpFnFp(t1.tp + t2.tp, t1.fn + t2.fn, t1.fp + t2.fp)) - - println(calcStat(tpTnFp.tp + tpTnFp.fn, tpTnFp.tp + tpTnFp.fp, tpTnFp.tp)) - - - /* val result = model.transform(testDataset.cache()) val tpTnFp = result.map ({ r => @@ -79,13 +53,9 @@ object I2b2DatasetLogRegTest extends App with Windowing { println(calcStat(tpTnFp.tp + tpTnFp.fn, tpTnFp.tp + tpTnFp.fp, tpTnFp.tp)) - val evaluator = new MulticlassClassificationEvaluator("f1").setMetricName("f1") - println("Test set f1 = " + evaluator.evaluate(result)) - val badGuys = result.filter(r => r.getAs[Double]("prediction") != r.getAs[Double]("label")).collect() println(badGuys) - val pred = result.select($"prediction").collect.map{ r => r.getAs[Double]("prediction") } @@ -95,52 +65,18 @@ object I2b2DatasetLogRegTest extends App with Windowing { } - println(confusionMatrix(pred, gold)) */ + println(confusionMatrix(pred, gold)) def train(dataFrame: DataFrame) = { -/* - import spark.implicits._ val lr = new LogisticRegression() - .setMaxIter(20) //20 - .setRegParam(0.00135) //0.0012 - .setElasticNetParam(0.8) //0.8 - .setTol(1.0) - .setStandardization(false) - - lr.fit(dataFrame) */ - - // Run training algorithm to build the model - val model = new LogisticRegressionWithLBFGS() - model.optimizer.setRegParam(0.0013) - model.setNumClasses(6) - - model.run(dataFrame.rdd - .map(r => LabeledPoint(r.getAs[Double]("label"), - r.getAs[MlLibVector]("features")))) - -/* - val layers = Array[Int](5630, 6) - - // 5078, 6 -> 0.8878302 - - // create the trainer and set its parameters - val trainer = new MultilayerPerceptronClassifier() - .setLayers(layers) - .setBlockSize(128) - .setSeed(1234L) - .setMaxIter(100) // 30 - .setTol(1E-6) //1E-5 - - trainer.fit(dataFrame.cache()) -*/ - + .setMaxIter(26) //20 + .setRegParam(0.00192) //0.0012 + .setElasticNetParam(0.9) //0.8 + lr.fit(dataFrame) } - /* TODO put in a common place */ def calcStat(correct: Long, predicted: Long, predictedCorrect: Long): (Float, Float, Float) = { - // prec = (predicted & correct) / predicted - // rec = (predicted & correct) / correct val prec = predictedCorrect.toFloat / predicted val rec = predictedCorrect.toFloat / correct val f1 = 2 * prec * rec / (prec + rec) @@ -153,16 +89,16 @@ object I2b2DatasetLogRegTest extends App with Windowing { val matrix : Map[T, MutableMap[T, Int]] = labels.map(label => (label, MutableMap(labels.zip(Array.fill(labels.size)(0)): _*))).toMap - predicted.zip(gold).foreach { case (p, g) => - matrix.get(p).get(g) += 1 - } - - /* sanity check */ - if(predicted.length ==matrix.map(map => map._2.values.sum).sum) - println("looks 
good") + predicted.zip(gold).foreach { case (p, g) => matrix.get(p).get(g) += 1} + /* sanity check, the confusion matrix should contain as many elements as there were used during training / prediction */ + assert(predicted.length ==matrix.map(map => map._2.values.sum).sum) matrix } - + // produces a org.apache.spark.ml.linalg.Vector + def convertToVectorUdf = udf {(array: Array[Double]) => + val tmp = Vectors.dense(array) + tmp + } } From a274039536deb0bc45b4e74df9d522a890c63e63 Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 18 Dec 2017 15:41:24 -0300 Subject: [PATCH 18/55] added tokenizers --- .../nlp/annotators/assertion/logreg/RegexTokenizer.scala | 8 ++++++++ .../nlp/annotators/assertion/logreg/SimpleTokenizer.scala | 8 ++++++++ .../nlp/annotators/assertion/logreg/Tokenizer.scala | 8 ++++++++ 3 files changed, 24 insertions(+) create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/SimpleTokenizer.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala new file mode 100644 index 00000000000000..ddf6f087d545a3 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala @@ -0,0 +1,8 @@ +package com.johnsnowlabs.nlp.annotators.assertion.logreg + +/** + * Created by jose on 18/12/17. + */ +class RegexTokenizer { + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/SimpleTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/SimpleTokenizer.scala new file mode 100644 index 00000000000000..d1f82aea1f4f5d --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/SimpleTokenizer.scala @@ -0,0 +1,8 @@ +package com.johnsnowlabs.nlp.annotators.assertion.logreg + +/** + * Created by jose on 18/12/17. + */ +class SimpleTokenizer { + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala new file mode 100644 index 00000000000000..ca4eb220ee2682 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala @@ -0,0 +1,8 @@ +package com.johnsnowlabs.nlp.annotators.assertion.logreg + +/** + * Created by jose on 18/12/17. 
+ */ +class Tokenizer { + +} From bd8e6142379d9b0a752e7fa0d24243a1c30ffe3c Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 18 Dec 2017 15:41:54 -0300 Subject: [PATCH 19/55] behavior for some tokenizers --- .../annotators/assertion/logreg/RegexTokenizer.scala | 11 ++++++++++- .../annotators/assertion/logreg/SimpleTokenizer.scala | 5 +++-- .../nlp/annotators/assertion/logreg/Tokenizer.scala | 6 +++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala index ddf6f087d545a3..2a12294f47cc56 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala @@ -3,6 +3,15 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg /** * Created by jose on 18/12/17. */ -class RegexTokenizer { +class RegexTokenizer extends Tokenizer{ + + /* these match the behavior we had when tokenizing sentences for word embeddings */ + val punctuation = Seq(".", ":", ";", ",", "?", "!", "+", "-", "_", "(", ")", "{", + "}", "#", "mg/kg", "ml", "m2", "cm", "/", "\\", "\"", "'", "[", "]", "%", "<", ">", "&", "=") + + val percent_regex = """([0-9]{1,2}\.[0-9]{1,2}%|[0-9]{1,3}%)""" + val number_regex = """([0-9]{1,6})""" + + override def tokenize(sent: String): Array[String] = ??? } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/SimpleTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/SimpleTokenizer.scala index d1f82aea1f4f5d..887501cd9a23d7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/SimpleTokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/SimpleTokenizer.scala @@ -3,6 +3,7 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg /** * Created by jose on 18/12/17. */ -class SimpleTokenizer { - +class SimpleTokenizer extends Tokenizer { + /* Tokenize a sentence splitting on spaces */ + def tokenize(sent: String) : Array[String] = sent.split(" ").map(_.trim).filter(_ != "") } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala index ca4eb220ee2682..be1bf44061d35c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala @@ -3,6 +3,10 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg /** * Created by jose on 18/12/17. 
*/ -class Tokenizer { +trait Tokenizer { + + def tokenize(sent: String) : Array[String] + + } From fd99273795e38a9e5f1b8887f9b767f973de35c0 Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 18 Dec 2017 15:42:19 -0300 Subject: [PATCH 20/55] refactor in windowing code --- .../assertion/logreg/Windowing.scala | 31 +++---------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 76992937d35b97..e93e78f1c98bf7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -18,27 +18,20 @@ trait Windowing extends Serializable { val after : Int lazy val wordVectors: Option[WordEmbeddings] = None - val random = new Random() - - /* TODO: create a tokenizer class */ - /* these match the behavior we had when tokenizing sentences for word embeddings */ - val punctuation = Seq(".", ":", ";", ",", "?", "!", "+", "-", "_", "(", ")", "{", - "}", "#", "mg/kg", "ml", "m2", "cm", "/", "\\", "\"", "'", "[", "]", "%", "<", ">", "&", "=") - - val percent_regex = """([0-9]{1,2}\.[0-9]{1,2}%|[0-9]{1,3}%)""" - val number_regex = """([0-9]{1,6})""" + val tokenizer : Tokenizer + val random = new Random() /* apply window, pad/truncate sentence according to window */ def applyWindow(doc: String, s: Int, e: Int): (Array[String], Array[String], Array[String]) = { val target = doc.slice(s, e) - val targetPart = tokenize(target.trim) + val targetPart = tokenizer.tokenize(target.trim) val leftPart = if (s == 0) Array[String]() - else tokenize(doc.slice(0, s).trim) + else tokenizer.tokenize(doc.slice(0, s).trim) val rightPart = if (e == doc.length) Array[String]() - else tokenize(doc.slice(e, doc.length).trim) + else tokenizer.tokenize(doc.slice(e, doc.length).trim) val (start, leftPadding) = if(leftPart.size >= before) @@ -92,16 +85,12 @@ trait Windowing extends Serializable { r.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ).map(_.toDouble) } - var tt = Array(random.nextGaussian(), random.nextGaussian()) - var nt = Array(-random.nextGaussian(), -random.nextGaussian()) - def applyWindowUdf(wvectors: WordEmbeddings) = //here s and e are token number for start and end of target when split on " " udf { (doc:String, targetTerm:String, s:Int, e:Int) => Vectors.dense(applyWindow(wvectors)(doc, targetTerm, s, e)) } - def l2norm(xs: Array[Double]):Double = { import math._ sqrt(xs.map{ x => pow(x, 2)}.sum) @@ -112,15 +101,5 @@ trait Windowing extends Serializable { vec.map(element => element / norm) } - /* Tokenize a sentence taking care of punctuation */ - def tokenize(sent: String) : Array[String] = { - var tmp = sent - // replace special characters - punctuation.foreach(c => tmp = tmp.replace(c, " " + c + " ")) - tmp = tmp.replace(",", " ") - tmp = tmp.replace(""", "\"") - tmp = tmp.replace("'", " ' ") - tmp.split(" ").map(_.trim).filter(_ != "") - } } From 9cf8c4e5786e2d1047348e2addec07be9e0a3349 Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 18 Dec 2017 15:42:48 -0300 Subject: [PATCH 21/55] some cleanup in test and comments --- .../ml/logreg/I2b2DatasetLogRegTest.scala | 11 +++++++---- .../com/johnsnowlabs/ml/logreg/I2b2Reader.scala | 15 ++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala 
b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala index a2c6fbed12ea25..f02c848d73ee27 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala @@ -1,7 +1,7 @@ package com.johnsnowlabs.ml.logreg -import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing +import com.johnsnowlabs.nlp.annotators.assertion.logreg.{SimpleTokenizer, Tokenizer, Windowing} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.functions._ @@ -13,6 +13,7 @@ object I2b2DatasetLogRegTest extends App with Windowing { override val before = 11 override val after = 13 + override val tokenizer: Tokenizer = new SimpleTokenizer implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[2]").getOrCreate() import spark.implicits._ @@ -69,9 +70,9 @@ object I2b2DatasetLogRegTest extends App with Windowing { def train(dataFrame: DataFrame) = { val lr = new LogisticRegression() - .setMaxIter(26) //20 - .setRegParam(0.00192) //0.0012 - .setElasticNetParam(0.9) //0.8 + .setMaxIter(26) + .setRegParam(0.00192) + .setElasticNetParam(0.9) lr.fit(dataFrame) } @@ -101,4 +102,6 @@ object I2b2DatasetLogRegTest extends App with Windowing { val tmp = Vectors.dense(array) tmp } + + } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala index 5f125c2ea4da12..c514870a72631f 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -1,19 +1,16 @@ package com.johnsnowlabs.ml.logreg -import java.io.File - import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.{DataFrame, SparkSession} - +import java.io.File import scala.io.Source -/* -* datasetPath: a list of datasets, for example the 'beth' or 'partner' directories (each containing -* an ast and txt folder). -* -* */ +/** + * Reader for the i2b2 dataset + * +*/ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Windowing { @@ -24,7 +21,7 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Wi /* receives the location of a single dataset (e.g. 
'beth'),
 * and returns a sequence of datapoints I2b2AnnotationAndText
 * */
-  def read(path: String): Seq[I2b2AnnotationAndText] = {
+  private def read(path: String): Seq[I2b2AnnotationAndText] = {
 
     // read list of ast files, without extension
     val astFileNames = {

From b72db8682579238f6f2bac1f9be3a232c0b5ca8b Mon Sep 17 00:00:00 2001
From: Alberto
Date: Tue, 19 Dec 2017 08:43:38 -0300
Subject: [PATCH 22/55] work in progress

---
 .../logreg/AssertionLogRegApproach.scala      | 79 +++++++++----------
 .../logreg/AssertionLogRegModel.scala         | 23 ++----
 .../assertion/logreg/Tokenizer.scala          |  2 +-
 .../assertion/logreg/Windowing.scala          | 10 +--
 .../ml/logreg/I2b2DatasetLogRegTest.scala     | 11 ++-
 .../ml/logreg/I2b2DatasetPipelineTest.scala   | 23 ++----
 .../johnsnowlabs/ml/logreg/I2b2Reader.scala   | 10 +--
 .../assertion/SentenceWindowingTest.scala     |  4 +-
 8 files changed, 65 insertions(+), 97 deletions(-)

diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala
index 6c0d28c358fba0..94dbbf007d8e19 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala
@@ -1,12 +1,12 @@
 package com.johnsnowlabs.nlp.annotators.assertion.logreg
 
 import com.johnsnowlabs.nlp.AnnotatorType._
-import com.johnsnowlabs.nlp.embeddings.AnnotatorWithWordEmbeddings
+import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings}
 import com.johnsnowlabs.nlp.AnnotatorApproach
-import org.apache.spark.sql.{Dataset, Row, SparkSession}
 import org.apache.spark.ml.classification.LogisticRegression
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.ml.param.Param
+import org.apache.spark.sql.Dataset
 
 /**
  * Created by jose on 22/11/17.
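
The hunk below replaces the approach's hard-coded window sizes and hyperparameters with Spark ML Params, each with a setter and a default. For orientation, a minimal configuration sketch against the resulting API; the column names mirror the defaults declared in the hunk, the window and regularization values are the ones the pipeline test in this same patch uses, and setInputCols/setOutputCol come from the annotator base class:

    import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach

    // Sketch only, not a recommended configuration.
    val assertionStatus = new AssertionLogRegApproach()
      .setLabelCol("label")       // e.g. 'Negated', 'Affirmed', 'Historical'
      .setTargetCol("target")     // term whose assertion status is predicted
      .setBefore(11)              // tokens of context kept left of the target
      .setAfter(13)               // tokens of context kept right of the target
      .setMaxIter(26)
      .setReg(0.00192)
      .setEnet(0.9)
      .setInputCols("document")
      .setOutputCol("assertion")
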
@@ -16,63 +16,60 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac override val requiredAnnotatorTypes = Array(DOCUMENT) //, POS override val description: String = "Clinical Text Status Assertion" + override val tokenizer: Tokenizer = new SimpleTokenizer + + lazy override val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) + override val annotatorType: AnnotatorType = ASSERTION def this() = this(Identifiable.randomUID("ASSERTION")) override lazy val localPath = "/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin.db" // example of possible values, 'Negated', 'Affirmed', 'Historical' val labelColumn = new Param[String](this, "label", "Column with one label per document") - // the document where we're extracting the assertion - val documentColumn = new Param[String](this, "documentColumn", "Column with one label per document") - + val documentColumn = new Param[String](this, "document", "Column with one label per document") // the target term, that must appear capitalized in the document, e.g., 'diabetes' - val targetColumn = new Param[String](this, "targetColumn", "Column with the target to analyze") - - override val (before, after) = (11, 13) - var tag2Vec : Map[String, Array[Double]] = Map() + val targetColumn = new Param[String](this, "target", "Column with the target to analyze") + val maxIter = new Param[Int](this, "maxIter", "Max number of iterations for algorithm") + val regParam = new Param[Double](this, "regParam", "Regularization parameter") + val eNetParam = new Param[Double](this, "eNetParam", "Elastic net parameter") + val beforeParam = new Param[Int](this, "before", "Length of the context before the target") + val afterParam = new Param[Int](this, "after", "Length of the context after the target") + + def setLabelCol(label: String) = set(labelColumn, label) + def setDocumentCol(document: String) = set(documentColumn, document) + def setTargetCol(target: String) = set(targetColumn, target) + def setMaxIter(max: Int) = set(maxIter, max) + def setReg(lambda: Double) = set(regParam, lambda) + def setEnet(enet: Double) = set(eNetParam, enet) + def setBefore(before: Int) = set(beforeParam, before) + def setAfter(after: Int) = set(afterParam, after) + + setDefault(labelColumn -> "label", + documentColumn -> "document", + targetColumn -> "target", + maxIter -> 26, + regParam -> 0.00192, + eNetParam -> 0.9, + beforeParam -> 10, + afterParam -> 10 + ) override def train(dataset: Dataset[_]): AssertionLogRegModel = { import dataset.sqlContext.implicits._ - // read the set of all tags - // val tagSet = inferTagSet(dataset.toDF) - // assign each tag an array of 3 floats - // tag2Vec = encode(tagSet) - /* apply UDF to fix the length of each document */ val processed = dataset.toDF. 
- //withColumn("features", applyWindowUdf(embeddings.get, tag2Vec)($"text", $"pos", $"start", $"end", $"target")).cache() - withColumn("features", applyWindowUdf(embeddings.get)($"text", $"target", $"start", $"end")).cache() - + withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) - /* TODO: pick the parameters you want to expose*/ val lr = new LogisticRegression() - .setMaxIter(26) - .setRegParam(0.00184) - .setElasticNetParam(0.9) + .setMaxIter(getOrDefault(maxIter)) + .setRegParam(getOrDefault(regParam)) + .setElasticNetParam(getOrDefault(eNetParam)) - fillModelEmbeddings(AssertionLogRegModel(lr.fit(processed), tag2Vec)) + fillModelEmbeddings(AssertionLogRegModel(lr.fit(processed))) } + override lazy val wordVectors: Option[WordEmbeddings] = embeddings - def inferTagSet(dataset: Dataset[Row]): Array[String] = - dataset.select("pos") - .collect() - .flatMap { row => - row.getAs[Seq[Row]](0).map(_.getString(3)).distinct - }.distinct - - - def encode(tagSet: Array[String]) : Map[String, Array[Double]]= { - val values = Array(.25, .50, .75, 1) - val codes = for (a <- values; - b <- values; - c <- values) yield { - import math.sqrt - val norm = sqrt(a * a + b * b + c * c) - Array(a/norm, b/norm, c/norm) - } - tagSet.sorted.zip(codes).toMap - } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index d8c994c2319282..6cb8e391adeb39 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -2,7 +2,7 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT, POS} import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType} -import com.johnsnowlabs.nlp.embeddings.ModelWithWordEmbeddings +import com.johnsnowlabs.nlp.embeddings.{ModelWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} @@ -11,22 +11,11 @@ import org.apache.spark.sql.{DataFrame, Dataset} * Created by jose on 22/11/17. */ -class AssertionLogRegModel(model:LogisticRegressionModel, tag2Vec: Map[String, Array[Double]], override val uid: String = Identifiable.randomUID("ASSERTIOM")) +class AssertionLogRegModel(model:LogisticRegressionModel, override val uid: String = Identifiable.randomUID("ASSERTION")) extends AnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings with Windowing { - - // TODO this should come as a parameter override val (before, after) = (11, 13) - - /** - * takes a document and annotations and produces new annotations of this annotator's annotation type - * - * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any - * @return any number of annotations processed for every input annotation. 
Not necessary one to one relationship - */ + override val tokenizer: Tokenizer = new SimpleTokenizer override protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] = annotations - //def this() = this(Identifiable.randomUID("ASSERTION")) - - override val annotatorType: AnnotatorType = AnnotatorType.ASSERTION override val requiredAnnotatorTypes = Array(DOCUMENT) //, POS override final def transform(dataset: Dataset[_]): DataFrame = { @@ -34,13 +23,13 @@ class AssertionLogRegModel(model:LogisticRegressionModel, tag2Vec: Map[String, A /* apply UDF to fix the length of each document */ val processed = dataset.toDF. - withColumn("features", applyWindowUdf(embeddings.get)($"text", $"target", $"start", $"end")).cache() //, $"text", $"pos", $"start", $"end", $"target" - //.select($"features", $"label") + withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) super.transform(model.transform(processed)) } + override lazy val wordVectors: Option[WordEmbeddings] = embeddings } object AssertionLogRegModel { - def apply(model: LogisticRegressionModel, tag2Vec: Map[String, Array[Double]]): AssertionLogRegModel = new AssertionLogRegModel(model, tag2Vec) + def apply(model: LogisticRegressionModel): AssertionLogRegModel = new AssertionLogRegModel(model) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala index be1bf44061d35c..38d62dc807ecdd 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala @@ -3,7 +3,7 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg /** * Created by jose on 18/12/17. 
*/ -trait Tokenizer { +trait Tokenizer extends Serializable { def tokenize(sent: String) : Array[String] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index e93e78f1c98bf7..3103e986f73d89 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -4,7 +4,6 @@ import com.johnsnowlabs.nlp.embeddings.WordEmbeddings import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.functions._ import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema -import scala.util.Random import scala.collection.mutable @@ -17,10 +16,9 @@ trait Windowing extends Serializable { val before : Int val after : Int - lazy val wordVectors: Option[WordEmbeddings] = None val tokenizer : Tokenizer - val random = new Random() + lazy val wordVectors: Option[WordEmbeddings] = None /* apply window, pad/truncate sentence according to window */ def applyWindow(doc: String, s: Int, e: Int): (Array[String], Array[String], Array[String]) = { @@ -85,10 +83,10 @@ trait Windowing extends Serializable { r.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ).map(_.toDouble) } - def applyWindowUdf(wvectors: WordEmbeddings) = - //here s and e are token number for start and end of target when split on " " + def applyWindowUdf = + //here 's' and 'e' are token number for start and end of target when split on " " udf { (doc:String, targetTerm:String, s:Int, e:Int) => - Vectors.dense(applyWindow(wvectors)(doc, targetTerm, s, e)) + Vectors.dense(applyWindow(wordVectors.get)(doc, targetTerm, s, e)) } def l2norm(xs: Array[Double]):Double = { diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala index f02c848d73ee27..adaac002b28f1a 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala @@ -2,13 +2,12 @@ package com.johnsnowlabs.ml.logreg import com.johnsnowlabs.nlp.annotators.assertion.logreg.{SimpleTokenizer, Tokenizer, Windowing} +import com.johnsnowlabs.nlp.embeddings.WordEmbeddings import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.functions._ import org.apache.spark.sql.{ColumnName, DataFrame, SparkSession} - - object I2b2DatasetLogRegTest extends App with Windowing { override val before = 11 @@ -31,17 +30,17 @@ object I2b2DatasetLogRegTest extends App with Windowing { val reader = new I2b2DatasetReader(embeddingsFile) val trainDataset = reader.readDataFrame(trainDatasetPath) - .withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target", $"start", $"end")) + .withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) .select($"features", $"label") println("trainDsSize: " + trainDataset.count) val testDataset = reader.readDataFrame(testDatasetPath) - .withColumn("features", applyWindowUdf(reader.wordVectors.get)($"text", $"target", $"start", $"end")) + .withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) .select($"features", $"label", $"text", $"target") println("testDsSize: " + testDataset.count) - val model = train(trainDataset) + val model = train(trainDataset.cache()) case class TpFnFp(tp: Int, fn: Int, fp: Int) // Compute 
raw scores on the test set. @@ -103,5 +102,5 @@ object I2b2DatasetLogRegTest extends App with Windowing { tmp } - + override val wordVectors: Option[WordEmbeddings] = reader.wordVectors } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index b28057d8e3eda6..4673d9206cd7fe 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -4,11 +4,10 @@ import com.johnsnowlabs.ml.logreg.I2b2DatasetLogRegTest.{calcStat, confusionMatr import com.johnsnowlabs.nlp.DocumentAssembler import com.johnsnowlabs.nlp.annotators.RegexTokenizer import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach -import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach -import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} import org.apache.spark.sql.SparkSession +import org.apache.spark.storage.StorageLevel object I2b2DatasetPipelineTest extends App { @@ -32,24 +31,17 @@ object I2b2DatasetPipelineTest extends App { .setInputCols(Array("document")) .setOutputCol("token") - val posTagger = new PerceptronApproach() - .setCorpusPath("/anc-pos-corpus/") - .setNIterations(10) - .setInputCols("token", "document") - .setOutputCol("pos") - val assertionStatus = new AssertionLogRegApproach() - .setInputCols("document") //, "pos" - + .setInputCols("document") .setOutputCol("assertion") + .setBefore(11) + .setAfter(13) .setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary) .setEmbeddingsFolder("/home/jose/Downloads/bio_nlp_vec") Array(documentAssembler, tokenizer, - //posTagger, assertionStatus) - } val reader = new I2b2DatasetReader(embeddingsFile) @@ -65,13 +57,13 @@ object I2b2DatasetPipelineTest extends App { val pipeline = new Pipeline() .setStages(getAssertionStages) - pipeline.fit(dataset) + pipeline.fit(dataset.cache()) } def testAssertionModel(path:Seq[String], model: PipelineModel) = { System.out.println("Test Dataset Reading") val dataset = reader.readDataFrame(path) - model.transform(dataset) + model.transform(dataset.cache()) } val model = trainAssertionModel(trainPaths) @@ -95,8 +87,5 @@ object I2b2DatasetPipelineTest extends App { r.getAs[Double]("label") } - println(confusionMatrix(pred, gold)) - - } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala index c514870a72631f..da9a749ab09e09 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -1,6 +1,5 @@ package com.johnsnowlabs.ml.logreg -import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.{DataFrame, SparkSession} @@ -12,9 +11,8 @@ import scala.io.Source * */ -class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Windowing { +class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable { - override val (before, after) = (8, 12) var fileDb = wordEmbeddingsFile + ".db" @@ -56,20 +54,16 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable with Wi * and returns a Spark 
DataFrame * */ def readDataFrame(datasetPaths: Seq[String]) (implicit session: SparkSession): DataFrame= { - //TODO: should windowing be here? import session.implicits._ datasetPaths.flatMap(read).filter(_!=null).toDF.withColumn("label", labelToNumber($"label")) - } - private val mappings = Map("hypothetical" -> 0.0, "present" -> 1.0, "absent" -> 2.0, "possible" -> 3.0, "conditional"-> 4.0, "associated_with_someone_else" -> 5.0) - /* TODO duplicated logic, consider relocation to common place */ - override lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { + lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { wordEmbeddingsFile => require(new File(wordEmbeddingsFile).exists()) val fileDb = wordEmbeddingsFile + ".db" diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala index 1985721b6613b0..42604791399d81 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.nlp.annotators.assertion -import com.johnsnowlabs.nlp.annotators.assertion.logreg.Windowing +import com.johnsnowlabs.nlp.annotators.assertion.logreg.{SimpleTokenizer, Windowing} import org.scalatest.{FlatSpec, Matchers} /** @@ -11,6 +11,8 @@ class SentenceWindowingTest extends FlatSpec with Matchers { trait Scope extends Windowing { override val before: Int = 5 override val after: Int = 5 + override val tokenizer = new SimpleTokenizer + override val wordVectors = None } "sentences" should "be correctly padded" in new Scope { From 9f53eab2878fcd974cb3418f7051185dcc756c95 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 20 Dec 2017 00:29:40 -0300 Subject: [PATCH 23/55] added commons evaluation metrics class --- .../ml/common/EvaluationMetrics.scala | 44 +++++++++++++++++ .../ml/logreg/I2b2DatasetLogRegTest.scala | 48 ++++--------------- .../ml/logreg/I2b2DatasetPipelineTest.scala | 23 ++------- 3 files changed, 57 insertions(+), 58 deletions(-) create mode 100644 src/test/scala/com/johnsnowlabs/ml/common/EvaluationMetrics.scala diff --git a/src/test/scala/com/johnsnowlabs/ml/common/EvaluationMetrics.scala b/src/test/scala/com/johnsnowlabs/ml/common/EvaluationMetrics.scala new file mode 100644 index 00000000000000..452d939e5ef598 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/ml/common/EvaluationMetrics.scala @@ -0,0 +1,44 @@ +package com.johnsnowlabs.ml.common + + +/** + * Created by jose on 19/12/17. 
+ */
+trait EvaluationMetrics {
+
+  case class TpFnFp(tp: Int, fn: Int, fp: Int)
+
+  def confusionMatrix[T](predicted: Seq[T], gold: Seq[T]) = {
+    val labels = gold.distinct
+    import scala.collection.mutable.{Map => MutableMap}
+    val matrix : Map[T, MutableMap[T, Int]] =
+      labels.map(label => (label, MutableMap(labels.zip(Array.fill(labels.size)(0)): _*))).toMap
+
+    predicted.zip(gold).foreach { case (p, g) => matrix.get(p).get(g) += 1}
+
+    /* sanity check: the confusion matrix should contain as many elements as were used during training / prediction */
+    assert(predicted.length == matrix.map(map => map._2.values.sum).sum)
+    matrix
+  }
+
+  def calcStat[T](predicted: Seq[T], gold: Seq[T]): (Float, Float, Float) = {
+    val tpFnFp = predicted.zip(gold).map({case (p, g) =>
+      if (p == g)
+        TpFnFp(1, 0, 0)
+      else
+        TpFnFp(0, 1, 1)
+    }).reduce((t1, t2) => TpFnFp(t1.tp + t2.tp, t1.fn + t2.fn, t1.fp + t2.fp))
+
+    calcStat(tpFnFp.tp + tpFnFp.fn, tpFnFp.tp + tpFnFp.fp, tpFnFp.tp)
+  }
+
+  def calcStat(correct: Long, predicted: Long, predictedCorrect: Long): (Float, Float, Float) = {
+    val prec = predictedCorrect.toFloat / predicted
+    val rec = predictedCorrect.toFloat / correct
+    val f1 = 2 * prec * rec / (prec + rec)
+    (prec, rec, f1)
+  }
+
+}
diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala
index adaac002b28f1a..63c4a691ccabe1 100644
--- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala
@@ -1,14 +1,15 @@
 package com.johnsnowlabs.ml.logreg
 
+import com.johnsnowlabs.ml.common.EvaluationMetrics
 import com.johnsnowlabs.nlp.annotators.assertion.logreg.{SimpleTokenizer, Tokenizer, Windowing}
 import com.johnsnowlabs.nlp.embeddings.WordEmbeddings
 import org.apache.spark.ml.classification.LogisticRegression
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, SparkSession}
 
-object I2b2DatasetLogRegTest extends App with Windowing {
+object I2b2DatasetLogRegTest extends App with Windowing with EvaluationMetrics {
 
   override val before = 11
   override val after = 13
 
   val model = train(trainDataset.cache())
-  case class TpFnFp(tp: Int, fn: Int, fp: Int)
 
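With the metrics extracted into the shared trait above, a test only needs the mixin plus two calls. A minimal, self-contained usage sketch with invented label sequences; note that confusionMatrix keys the outer map by predicted label and the inner maps by gold label, so it assumes every predicted label also occurs in the gold sequence:

    import com.johnsnowlabs.ml.common.EvaluationMetrics

    object EvaluationMetricsExample extends App with EvaluationMetrics {
      // Hypothetical gold labels and predictions for a three-class problem.
      val gold = Seq(0.0, 1.0, 2.0, 1.0, 0.0)
      val pred = Seq(0.0, 1.0, 1.0, 1.0, 2.0)

      // (precision, recall, f1) computed from exact-match counts
      println(calcStat(pred, gold))
      // counts of (predicted -> gold) label pairs
      println(confusionMatrix(pred, gold))
    }

   // Compute raw scores on the test set.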
val result = model.transform(testDataset.cache()) - val tpTnFp = result.map ({ r => - if (r.getAs[Double]("prediction") == r.getAs[Double]("label")) TpFnFp(1, 0, 0) - else TpFnFp(0, 1, 1) - }).collect().reduce((t1, t2) => TpFnFp(t1.tp + t2.tp, t1.fn + t2.fn, t1.fp + t2.fp)) - - println(calcStat(tpTnFp.tp + tpTnFp.fn, tpTnFp.tp + tpTnFp.fp, tpTnFp.tp)) - val badGuys = result.filter(r => r.getAs[Double]("prediction") != r.getAs[Double]("label")).collect() println(badGuys) - val pred = result.select($"prediction").collect.map{ r => - r.getAs[Double]("prediction") - } - - val gold = result.select($"label").collect.map{ r => - r.getAs[Double]("label") - } + val pred = result.select($"prediction").collect.map(_.getAs[Double]("prediction")) + val gold = result.select($"label").collect.map(_.getAs[Double]("label")) + println(calcStat(pred, gold)) println(confusionMatrix(pred, gold)) @@ -76,31 +65,10 @@ object I2b2DatasetLogRegTest extends App with Windowing { lr.fit(dataFrame) } - def calcStat(correct: Long, predicted: Long, predictedCorrect: Long): (Float, Float, Float) = { - val prec = predictedCorrect.toFloat / predicted - val rec = predictedCorrect.toFloat / correct - val f1 = 2 * prec * rec / (prec + rec) - (prec, rec, f1) - } - - def confusionMatrix[T](predicted: Seq[T], gold: Seq[T]) = { - val labels = gold.distinct - import scala.collection.mutable.{Map => MutableMap} - val matrix : Map[T, MutableMap[T, Int]] = - labels.map(label => (label, MutableMap(labels.zip(Array.fill(labels.size)(0)): _*))).toMap - - predicted.zip(gold).foreach { case (p, g) => matrix.get(p).get(g) += 1} - - /* sanity check, the confusion matrix should contain as many elements as there were used during training / prediction */ - assert(predicted.length ==matrix.map(map => map._2.values.sum).sum) - matrix - } - // produces a org.apache.spark.ml.linalg.Vector def convertToVectorUdf = udf {(array: Array[Double]) => - val tmp = Vectors.dense(array) - tmp + Vectors.dense(array) } - override val wordVectors: Option[WordEmbeddings] = reader.wordVectors + override lazy val wordVectors: Option[WordEmbeddings] = reader.wordVectors } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index 4673d9206cd7fe..558c50c3cc71d2 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -1,5 +1,6 @@ package com.johnsnowlabs.ml.logreg +import com.johnsnowlabs.ml.common.EvaluationMetrics import com.johnsnowlabs.ml.logreg.I2b2DatasetLogRegTest.{calcStat, confusionMatrix} import com.johnsnowlabs.nlp.DocumentAssembler import com.johnsnowlabs.nlp.annotators.RegexTokenizer @@ -9,7 +10,7 @@ import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel -object I2b2DatasetPipelineTest extends App { +object I2b2DatasetPipelineTest extends App with EvaluationMetrics{ implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[4]") .config("spark.executor.memory", "2g").getOrCreate @@ -69,23 +70,9 @@ object I2b2DatasetPipelineTest extends App { val model = trainAssertionModel(trainPaths) val result = testAssertionModel(testPaths, model) - /* TODO all this to common place */ - import spark.implicits._ - case class TpFnFp(tp: Int, fn: Int, fp: Int) - val tpFnFp = result.map ({ r => - if (r.getAs[Double]("prediction") == 
r.getAs[Double]("label")) TpFnFp(1, 0, 0) - else TpFnFp(0, 1, 1) - }).collect().reduce((t1, t2) => TpFnFp(t1.tp + t2.tp, t1.fn + t2.fn, t1.fp + t2.fp)) - - println(calcStat(tpFnFp.tp + tpFnFp.fn, tpFnFp.tp + tpFnFp.fp, tpFnFp.tp)) - - val pred = result.select($"prediction").collect.map{ r => - r.getAs[Double]("prediction") - } - - val gold = result.select($"label").collect.map{ r => - r.getAs[Double]("label") - } + val pred = result.select($"prediction").collect.map(_.getAs[Double]("prediction")) + val gold = result.select($"label").collect.map(_.getAs[Double]("label")) + println(calcStat(pred, gold)) println(confusionMatrix(pred, gold)) } From 8a035dd9ab404545fe884b36f8cdf250e6acef4e Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 22 Dec 2017 11:50:26 -0300 Subject: [PATCH 24/55] some cleanup --- .../assertion/logreg/AssertionLogRegApproach.scala | 4 ++-- .../ml/logreg/I2b2DatasetLogRegTest.scala | 12 +++--------- .../ml/logreg/I2b2DatasetPipelineTest.scala | 6 ++---- .../com/johnsnowlabs/ml/logreg/I2b2Reader.scala | 1 - 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 94dbbf007d8e19..5e1f816dc48d59 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -14,7 +14,7 @@ import org.apache.spark.sql.Dataset class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproach[AssertionLogRegModel] with AnnotatorWithWordEmbeddings with Windowing { - override val requiredAnnotatorTypes = Array(DOCUMENT) //, POS + override val requiredAnnotatorTypes = Array(DOCUMENT) override val description: String = "Clinical Text Status Assertion" override val tokenizer: Tokenizer = new SimpleTokenizer @@ -22,7 +22,7 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac override val annotatorType: AnnotatorType = ASSERTION def this() = this(Identifiable.randomUID("ASSERTION")) - override lazy val localPath = "/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin.db" + override lazy val localPath = getOrDefault(sourceEmbeddingsPath) // example of possible values, 'Negated', 'Affirmed', 'Historical' val labelColumn = new Param[String](this, "label", "Column with one label per document") diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala index 63c4a691ccabe1..cf229a46543016 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala @@ -5,7 +5,6 @@ import com.johnsnowlabs.ml.common.EvaluationMetrics import com.johnsnowlabs.nlp.annotators.assertion.logreg.{SimpleTokenizer, Tokenizer, Windowing} import com.johnsnowlabs.nlp.embeddings.WordEmbeddings import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, SparkSession} @@ -14,6 +13,7 @@ object I2b2DatasetLogRegTest extends App with Windowing with EvaluationMetrics { override val before = 11 override val after = 13 override val tokenizer: Tokenizer = new SimpleTokenizer + override lazy val wordVectors: Option[WordEmbeddings] = 
reader.wordVectors implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[2]").getOrCreate() import spark.implicits._ @@ -46,8 +46,8 @@ object I2b2DatasetLogRegTest extends App with Windowing with EvaluationMetrics { // Compute raw scores on the test set. val result = model.transform(testDataset.cache()) - val badGuys = result.filter(r => r.getAs[Double]("prediction") != r.getAs[Double]("label")).collect() - println(badGuys) + val errors = result.filter(r => r.getAs[Double]("prediction") != r.getAs[Double]("label")).collect() + println(errors) val pred = result.select($"prediction").collect.map(_.getAs[Double]("prediction")) val gold = result.select($"label").collect.map(_.getAs[Double]("label")) @@ -65,10 +65,4 @@ object I2b2DatasetLogRegTest extends App with Windowing with EvaluationMetrics { lr.fit(dataFrame) } - // produces a org.apache.spark.ml.linalg.Vector - def convertToVectorUdf = udf {(array: Array[Double]) => - Vectors.dense(array) - } - - override lazy val wordVectors: Option[WordEmbeddings] = reader.wordVectors } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index 558c50c3cc71d2..eff396cab054b7 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -1,16 +1,14 @@ package com.johnsnowlabs.ml.logreg import com.johnsnowlabs.ml.common.EvaluationMetrics -import com.johnsnowlabs.ml.logreg.I2b2DatasetLogRegTest.{calcStat, confusionMatrix} import com.johnsnowlabs.nlp.DocumentAssembler import com.johnsnowlabs.nlp.annotators.RegexTokenizer import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} import org.apache.spark.sql.SparkSession -import org.apache.spark.storage.StorageLevel -object I2b2DatasetPipelineTest extends App with EvaluationMetrics{ +object I2b2DatasetPipelineTest extends App with EvaluationMetrics { implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[4]") .config("spark.executor.memory", "2g").getOrCreate @@ -20,7 +18,7 @@ object I2b2DatasetPipelineTest extends App with EvaluationMetrics{ ,"/home/jose/Downloads/i2b2/concept_assertion_relation_training_data/beth") val testPaths = Seq("/home/jose/Downloads/i2b2/test_data") - val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" + val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin.db" def getAssertionStages(): Array[_ <: PipelineStage] = { diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala index da9a749ab09e09..ec803e0c286120 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -15,7 +15,6 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable { var fileDb = wordEmbeddingsFile + ".db" - /* receives the location of a single dataset (e.g. 
'beth'), * and returns a sequence of datapoins I2b2AnnotationAndText * */ From 935b49b7c2e636e5c9f5901f389b683bb259dd95 Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 23 Dec 2017 16:46:37 -0300 Subject: [PATCH 25/55] corrections in tests, annotations, and labels --- .../logreg/AssertionLogRegApproach.scala | 20 +++- .../logreg/AssertionLogRegModel.scala | 113 ++++++++++++++++-- .../assertion/logreg/Windowing.scala | 2 +- .../AnnotatorWithWordEmbeddings.scala | 1 - .../ml/logreg/I2b2DatasetLogRegTest.scala | 10 +- .../ml/logreg/I2b2DatasetPipelineTest.scala | 32 +++-- .../johnsnowlabs/ml/logreg/I2b2Reader.scala | 11 +- .../assertion/SentenceWindowingTest.scala | 16 ++- 8 files changed, 162 insertions(+), 43 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index c210173d3a1a52..377ee6094e4035 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -7,6 +7,7 @@ import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.param.Param import org.apache.spark.sql.Dataset +import org.apache.spark.sql.functions._ /** * Created by jose on 22/11/17. @@ -17,12 +18,12 @@ class AssertionLogRegApproach(override val uid: String) extends override val requiredAnnotatorTypes = Array(DOCUMENT) override val description: String = "Clinical Text Status Assertion" override val tokenizer: Tokenizer = new SimpleTokenizer + override lazy val wordVectors: Option[WordEmbeddings] = embeddings lazy override val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) override val annotatorType: AnnotatorType = ASSERTION def this() = this(Identifiable.randomUID("ASSERTION")) - //override lazy val localPath = getOrDefault(sourceEmbeddingsPath) // example of possible values, 'Negated', 'Affirmed', 'Historical' val labelColumn = new Param[String](this, "label", "Column with one label per document") @@ -67,9 +68,20 @@ class AssertionLogRegApproach(override val uid: String) extends .setRegParam(getOrDefault(regParam)) .setElasticNetParam(getOrDefault(eNetParam)) - AssertionLogRegModel(lr.fit(processed)) - } + val labelCol = getOrDefault(labelColumn) - override lazy val wordVectors: Option[WordEmbeddings] = embeddings + /* infer labels and assign a number to each */ + val labelMappings: Map[String, Double] = dataset.select(labelCol).distinct.collect + .map(row => row.getAs[String](labelCol)).zipWithIndex + .map{case (label, idx) => (label, idx.toDouble)} + .toMap + + val processedWithLabel = processed.withColumn(labelCol, labelToNumber(labelMappings)(col(labelCol))) + + AssertionLogRegModel() + .setLabelMap(labelMappings) + .setModel(lr.fit(processedWithLabel)) + } + private def labelToNumber(mappings: Map[String, Double]) = udf { label:String => mappings.get(label)} } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 0d6ec8788e0980..f09714f40ac8e7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -1,35 +1,124 @@ package 
com.johnsnowlabs.nlp.annotators.assertion.logreg -import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT, POS} -import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType} +import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT} +import com.johnsnowlabs.nlp.Annotation import com.johnsnowlabs.nlp.embeddings.{ModelWithWordEmbeddings, WordEmbeddings} -import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.classification.LogisticRegressionModel +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.functions.udf + +import scala.collection.immutable.Map + /** * Created by jose on 22/11/17. */ -class AssertionLogRegModel(model:LogisticRegressionModel, override val uid: String = Identifiable.randomUID("ASSERTION")) +class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) extends ModelWithWordEmbeddings[AssertionLogRegModel] with Windowing { + override val (before, after) = (11, 13) override val tokenizer: Tokenizer = new SimpleTokenizer - override protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] = annotations - override val annotatorType: AnnotatorType = AnnotatorType.ASSERTION - override val requiredAnnotatorTypes = Array(DOCUMENT) //, POS + override val annotatorType: AnnotatorType = ASSERTION + override val requiredAnnotatorTypes = Array(DOCUMENT) + override final def transform(dataset: Dataset[_]): DataFrame = { import dataset.sqlContext.implicits._ + require(model.isDefined, "model must be set before tagging") /* apply UDF to fix the length of each document */ val processed = dataset.toDF. 
withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) - super.transform(model.transform(processed)) + model.get.transform(processed).withColumn(getOutputCol, packAnnotations($"text", $"target", $"start", $"end", $"prediction")) + } + + private def packAnnotations = udf { (text: String, target: String, s: Int, e: Int, prediction: Double) => + val tokens = text.split(" ").filter(_!="") + + /* convert start and end are indexes in the doc string */ + val start = tokens.slice(0, s).map(_.length).sum + + tokens.slice(0, s).size // account for spaces + val end = start + tokens.slice(s, e + 1).map(_.length).sum + + tokens.slice(s, e + 1).size - 2 // account for spaces + + val annotation = Annotation("assertion", start, end, labelMap.get(prediction), Map()) + Seq(annotation) } + + override protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] = annotations + override lazy val wordVectors: Option[WordEmbeddings] = embeddings + + var model: Option[LogisticRegressionModel] = None + var labelMap: Option[Map[Double, String]] = None + + def setModel(m: LogisticRegressionModel): AssertionLogRegModel = { + model = Some(m) + this + } + + def setLabelMap(labelMappings: Map[String, Double]) = { + labelMap = Some(labelMappings.map(_.swap)) + this + } + + override def write: MLWriter = new AssertionLogRegModel.AssertionModelWriter(this, super.write) } -object AssertionLogRegModel { - def apply(model: LogisticRegressionModel): AssertionLogRegModel = new AssertionLogRegModel(model) +object AssertionLogRegModel extends DefaultParamsReadable[AssertionLogRegModel] { + def apply(): AssertionLogRegModel = new AssertionLogRegModel() + override def read: MLReader[AssertionLogRegModel] = new AssertionModelReader(super.read) + + + class AssertionModelReader(baseReader: MLReader[AssertionLogRegModel]) extends MLReader[AssertionLogRegModel] { + override def load(path: String): AssertionLogRegModel = { + val instance = baseReader.load(path) + val modelPath = new Path(path, "model").toString + val loaded = LogisticRegressionModel.read.load(modelPath) + + val labelsPath = new Path(path, "labels").toString + val labelsLoaded = sparkSession.sqlContext.read.format("parquet") + .load(labelsPath) + .collect + .map(_.toString) + + val dict = labelsLoaded + .map {line => + val items = line.split(":") + (items(0).drop(1).toDouble, items(1).dropRight(1)) + } + .toMap + + instance + .setLabelMap(dict.map(_.swap)) + .setModel(loaded) + instance.deserializeEmbeddings(path, sparkSession.sparkContext) + instance + } + } + + class AssertionModelWriter(model: AssertionLogRegModel, baseWriter: MLWriter) extends MLWriter { + + override protected def saveImpl(path: String): Unit = { + require(model.model.isDefined, "Assertion Model must be defined before serialization") + require(model.labelMap.isDefined, "Label Map must be defined before serialization") + baseWriter.save(path) + val modelPath = new Path(path, "model").toString + model.model.get.save(modelPath) + + val spark = sparkSession + import spark.sqlContext.implicits._ + val labelsPath = new Path(path, "labels").toString + model.labelMap.get.toSeq.map(p => p._1 + ":" + p._2).toDS.write.mode("overwrite").parquet(labelsPath) + + model.serializeEmbeddings(path, sparkSession.sparkContext) + } + } + } + + + diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 3103e986f73d89..82f74cf1a48b7e 100644 --- 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -64,7 +64,7 @@ trait Windowing extends Serializable { val start = tokens.slice(0, s).map(_.length).sum + tokens.slice(0, s).size // account for spaces val end = start + tokens.slice(s, e + 1).map(_.length).sum + - tokens.slice(s, e + 1).size // account for spaces + tokens.slice(s, e + 1).size - 1 // account for spaces val (l, t, r) = applyWindow(doc.toLowerCase, start, end) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala index c81041ce555d63..4b0735b054f8e2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala @@ -54,7 +54,6 @@ abstract class AnnotatorWithWordEmbeddings[A <: AnnotatorWithWordEmbeddings[A, M get(sourceEmbeddingsPath).map(_ => WordEmbeddings(localPath, $(embeddingsNDims))) } - private lazy val localPath: String = { WordEmbeddingsClusterHelper.createLocalPath } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala index a7fed100904625..099c3cff5de82f 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetLogRegTest.scala @@ -18,6 +18,10 @@ object I2b2DatasetLogRegTest extends App with Windowing with EvaluationMetrics { implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[2]").getOrCreate() import spark.implicits._ + val mappings = Map("hypothetical" -> 0.0, + "present" -> 1.0, "absent" -> 2.0, "possible" -> 3.0, + "conditional"-> 4.0, "associated_with_someone_else" -> 5.0) + // directory of the i2b2 dataset val i2b2Dir = "/home/jose/Downloads/i2b2" @@ -29,15 +33,17 @@ object I2b2DatasetLogRegTest extends App with Windowing with EvaluationMetrics { val embeddingsDims = 200 // word embeddings location val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" - val reader = new I2b2DatasetReader(embeddingsFile) + val reader = new I2b2DatasetReader(wordEmbeddingsFile = embeddingsFile, targetLengthLimit = 8) val trainDataset = reader.readDataFrame(trainDatasetPath) .withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) + .withColumn("label", labelToNumber($"label")) .select($"features", $"label") println("trainDsSize: " + trainDataset.count) val testDataset = reader.readDataFrame(testDatasetPath) .withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) + .withColumn("label", labelToNumber($"label")) .select($"features", $"label", $"text", $"target") println("testDsSize: " + testDataset.count) @@ -66,4 +72,6 @@ object I2b2DatasetLogRegTest extends App with Windowing with EvaluationMetrics { lr.fit(dataFrame) } + def labelToNumber = udf { label:String => mappings.get(label)} + } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index 68f9bbd2005728..5ee156f27574b2 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -1,11 +1,11 @@ package com.johnsnowlabs.ml.logreg 
import com.johnsnowlabs.ml.common.EvaluationMetrics -import com.johnsnowlabs.nlp.DocumentAssembler -import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach +import com.johnsnowlabs.nlp.{Annotation, DocumentAssembler} +import com.johnsnowlabs.nlp.annotators.assertion.logreg.{AssertionLogRegApproach, AssertionLogRegModel} import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Row, SparkSession} object I2b2DatasetPipelineTest extends App with EvaluationMetrics { @@ -17,8 +17,8 @@ object I2b2DatasetPipelineTest extends App with EvaluationMetrics { // word embeddings location val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" - val trainPaths = Seq(s"${i2b2Dir}/concept_assertion_relation_training_data/partners" - , s"${i2b2Dir}/concept_assertion_relation_training_data/beth") + val trainPaths = Seq(//s"${i2b2Dir}/concept_assertion_relation_training_data/partners", + s"${i2b2Dir}/concept_assertion_relation_training_data/beth") val testPaths = Seq(s"$i2b2Dir/test_data") @@ -29,6 +29,7 @@ object I2b2DatasetPipelineTest extends App with EvaluationMetrics { .setOutputCol("document") val assertionStatus = new AssertionLogRegApproach() + .setLabelCol("label") .setInputCols("document") .setOutputCol("assertion") .setBefore(11) @@ -39,13 +40,11 @@ object I2b2DatasetPipelineTest extends App with EvaluationMetrics { assertionStatus) } - val reader = new I2b2DatasetReader(embeddingsFile) + val reader = new I2b2DatasetReader(wordEmbeddingsFile = embeddingsFile, targetLengthLimit = 8) def trainAssertionModel(paths: Seq[String]): PipelineModel = { System.out.println("Train Dataset Reading") - val time = System.nanoTime() val dataset = reader.readDataFrame(paths) - System.out.println(s"Done, ${(System.nanoTime() - time)/1e9}\n") System.out.println("Start fitting") // train Assertion Status @@ -64,9 +63,22 @@ object I2b2DatasetPipelineTest extends App with EvaluationMetrics { val model = trainAssertionModel(trainPaths) val result = testAssertionModel(testPaths, model) - val pred = result.select($"prediction").collect.map(_.getAs[Double]("prediction")) - val gold = result.select($"label").collect.map(_.getAs[Double]("label")) + var pred = result.select($"assertion").collect.map(row => Annotation(row.getAs[Seq[Row]]("assertion").head).result) + var gold = result.select($"label").collect.map(_.getAs[String]("label")) println(calcStat(pred, gold)) println(confusionMatrix(pred, gold)) + + /* test serialization */ + val modelName = "assertion_model" + model.write.overwrite().save(modelName) + val readModel = PipelineModel.read.load(modelName) + + val otherResult = testAssertionModel(testPaths, readModel) + pred = otherResult.select($"assertion").collect.map(row => Annotation(row.getAs[Seq[Row]]("assertion").head).result) + gold = otherResult.select($"label").collect.map(_.getAs[String]("label")) + + println(calcStat(pred, gold)) + println(confusionMatrix(pred, gold)) + } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala index ec803e0c286120..eff8351689658e 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala @@ -11,7 +11,7 @@ import scala.io.Source * */ -class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable { +class 
I2b2DatasetReader(wordEmbeddingsFile: String, targetLengthLimit:Int) extends Serializable { var fileDb = wordEmbeddingsFile + ".db" @@ -38,7 +38,7 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable { } yield { val record = I2b2Annotation(annotation) val text = sourceTxt(record.sourceLine - 1) - if(record.target.split(" ").length > 8){ + if(record.target.split(" ").length > targetLengthLimit){ tooLong += 1 null } @@ -54,14 +54,10 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable { * */ def readDataFrame(datasetPaths: Seq[String]) (implicit session: SparkSession): DataFrame= { import session.implicits._ - datasetPaths.flatMap(read).filter(_!=null).toDF.withColumn("label", labelToNumber($"label")) + datasetPaths.flatMap(read).filter(_!=null).toDF //.withColumn("label", labelToNumber($"label")) } - private val mappings = Map("hypothetical" -> 0.0, - "present" -> 1.0, "absent" -> 2.0, "possible" -> 3.0, - "conditional"-> 4.0, "associated_with_someone_else" -> 5.0) - /* TODO duplicated logic, consider relocation to common place */ lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { wordEmbeddingsFile => require(new File(wordEmbeddingsFile).exists()) @@ -71,7 +67,6 @@ class I2b2DatasetReader(wordEmbeddingsFile: String) extends Serializable { }.filter(_ => new File(fileDb).exists()) .map(_ => WordEmbeddings(fileDb, 200)) - def labelToNumber = udf { label:String => mappings.get(label)} } case class I2b2Annotation(target: String, label: String, start:Int, end:Int, sourceLine:Int) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala index 63e6d440c67c66..c12350f25e621a 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala @@ -21,7 +21,7 @@ class SentenceWindowingTest extends FlatSpec with Matchers { val result = applyWindow(doc, target) val expected = Array("empty_marker", "empty_marker", "empty_marker", "empty_marker", "the", "cat", "eats", "fish", "empty_marker", "empty_marker", "empty_marker") - assert(expected === result) + assert(expected === result.tupleToList) } "sentences" should "be correctly truncated" in new Scope { @@ -29,7 +29,7 @@ class SentenceWindowingTest extends FlatSpec with Matchers { val target = "cat" val expected = "has been said that the cat eats fish while listens to".split(" ") val result = applyWindow(doc, target) - assert(expected === result) + assert(expected === result.tupleToList) } "multi word targets" should "be correctly identified" in new Scope{ @@ -37,7 +37,7 @@ class SentenceWindowingTest extends FlatSpec with Matchers { val target = "the cat" val expected = "it has been said that the cat eats fish while listens".split(" ") val result = applyWindow(doc, target) - assert(expected === result) + assert(expected === result.tupleToList) } "targets in the border" should "be correctly identified - left" in new Scope { @@ -46,7 +46,7 @@ class SentenceWindowingTest extends FlatSpec with Matchers { val expected = ("empty_marker empty_marker empty_marker empty_marker empty_marker " + "the cat eats fish while listens").split(" ") val result = applyWindow(doc, target) - assert(expected === result) + assert(expected === result.tupleToList) } "targets in the border" should "be correctly identified - right" in new Scope { @@ -54,7 +54,7 @@ 
class SentenceWindowingTest extends FlatSpec with Matchers { val target = "the cat" val expected = "it has been said that the cat empty_marker empty_marker empty_marker empty_marker ".split(" ") val result = applyWindow(doc, target) - assert(expected === result) + assert(expected === result.tupleToList) } "target occupies the whole text" should "be correctly chunked and padded" in new Scope { @@ -64,7 +64,11 @@ class SentenceWindowingTest extends FlatSpec with Matchers { "post-operative transient ischemic attack empty_marker empty_marker").split(" ") val result = applyWindow(doc, target) - assert(expected === result) + assert(expected === result.tupleToList) + } + + implicit class TupleOperations(t:Tuple3[Array[String], Array[String], Array[String]]) { + def tupleToList = t._1 ++ t._2 ++ t._3 } } From 4a7fd433c339b8d1c2b783a296b55157df3aafcc Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 23 Dec 2017 16:55:26 -0300 Subject: [PATCH 26/55] restored complete i2b2 dataset --- .../com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index 5ee156f27574b2..85e3203190b075 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -17,7 +17,7 @@ object I2b2DatasetPipelineTest extends App with EvaluationMetrics { // word embeddings location val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" - val trainPaths = Seq(//s"${i2b2Dir}/concept_assertion_relation_training_data/partners", + val trainPaths = Seq(s"${i2b2Dir}/concept_assertion_relation_training_data/partners", s"${i2b2Dir}/concept_assertion_relation_training_data/beth") val testPaths = Seq(s"$i2b2Dir/test_data") From d6049a9dc14bcdd4a98c8d37b39e7382cca8759a Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 23 Dec 2017 17:17:15 -0300 Subject: [PATCH 27/55] code for before & after parameters in model --- .../assertion/logreg/AssertionLogRegApproach.scala | 3 ++- .../assertion/logreg/AssertionLogRegModel.scala | 14 +++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 377ee6094e4035..1cd30a1c4ea6f7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -2,7 +2,6 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings} -import com.johnsnowlabs.nlp.AnnotatorApproach import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.param.Param @@ -79,6 +78,8 @@ class AssertionLogRegApproach(override val uid: String) extends val processedWithLabel = processed.withColumn(labelCol, labelToNumber(labelMappings)(col(labelCol))) AssertionLogRegModel() + .setBefore(getOrDefault(beforeParam)) + .setAfter(getOrDefault(afterParam)) .setLabelMap(labelMappings) .setModel(lr.fit(processedWithLabel)) } diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index f09714f40ac8e7..280594ed2d1637 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -7,6 +7,7 @@ import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.hadoop.fs.Path +import org.apache.spark.ml.param.Param import org.apache.spark.sql.functions.udf import scala.collection.immutable.Map @@ -19,7 +20,18 @@ import scala.collection.immutable.Map class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) extends ModelWithWordEmbeddings[AssertionLogRegModel] with Windowing { - override val (before, after) = (11, 13) + val beforeParam = new Param[Int](this, "before", "Length of the context before the target") + val afterParam = new Param[Int](this, "after", "Length of the context after the target") + override lazy val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) + + setDefault( + beforeParam -> 11, + afterParam -> 13 + ) + + def setBefore(before: Int) = set(beforeParam, before) + def setAfter(after: Int) = set(afterParam, after) + override val tokenizer: Tokenizer = new SimpleTokenizer override val annotatorType: AnnotatorType = ASSERTION override val requiredAnnotatorTypes = Array(DOCUMENT) From f319eca45ef66d2203db8c7cb00b10f09a8f51bb Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 23 Dec 2017 17:17:23 -0300 Subject: [PATCH 28/55] cosmetic --- src/main/scala/com/johnsnowlabs/ml/crf/DatasetEncoder.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/crf/DatasetEncoder.scala b/src/main/scala/com/johnsnowlabs/ml/crf/DatasetEncoder.scala index 3b3b367abfb9d1..bee0a1240ade8a 100644 --- a/src/main/scala/com/johnsnowlabs/ml/crf/DatasetEncoder.scala +++ b/src/main/scala/com/johnsnowlabs/ml/crf/DatasetEncoder.scala @@ -128,4 +128,4 @@ class DatasetEncoder(val startLabel: String = "@#Start") { result } } -} \ No newline at end of file +} From 26f4366f5ea5bf1e8a32d06eaabb224e07560922 Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 23 Dec 2017 17:18:58 -0300 Subject: [PATCH 29/55] cosmetic --- .../scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala index a6db84ad632e1b..b64c5fb33bfc81 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/Tagged.scala @@ -105,6 +105,4 @@ object NerTagged extends Tagged[NerTaggedSentence]{ labels.zip(sentences) } } - - } From c092b2c52ac6d82386ab8de5d2c8eba3b600cd4f Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 23 Dec 2017 17:24:23 -0300 Subject: [PATCH 30/55] cleanup --- .../johnsnowlabs/ml/logreg/convert_negex.py | 25 ------------------- .../assertion/AssertionStatusTest.scala | 14 ----------- 2 files changed, 39 deletions(-) delete mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/convert_negex.py delete mode 100644 
src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/AssertionStatusTest.scala diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/convert_negex.py b/src/test/scala/com/johnsnowlabs/ml/logreg/convert_negex.py deleted file mode 100644 index 4ef5e3afca9ac2..00000000000000 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/convert_negex.py +++ /dev/null @@ -1,25 +0,0 @@ -''' - -Simple script to convert the negex files to CSV. -Merge all annotations to form a single classification problem. - -''' - -import sys, re - -if len(sys.argv) < 2: - print 'Which file?' - exit(1) - -delimiter = ',' - -with open(sys.argv[1]) as input, open(sys.argv[1] + '.csv', 'w') as output: - total = 0 - output.write('sentence\ttarget\tlabel\n') - for line in input: - total += 1 - chunks = line.split('\t') - if len(chunks[2].split(' ')) > 4: - continue - print(chunks) - output.write(chunks[2] + '\t' + chunks[1] + '\t' + chunks[3] + '\n') diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/AssertionStatusTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/AssertionStatusTest.scala deleted file mode 100644 index aeac41b14163ce..00000000000000 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/AssertionStatusTest.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.johnsnowlabs.nlp.annotators.assertion - -import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach - -/** - * Created by jose on 22/11/17. - */ -class AssertionStatusTest extends App { - - object assertion extends AssertionLogRegApproach { - - } - -} From fbb81069cbf31261be72d776bfbbb52db4e27568 Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 23 Dec 2017 17:25:03 -0300 Subject: [PATCH 31/55] cosmetic --- .../nlp/annotators/ner/crf/NerCrfApproachSpec.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala index 8cfe7e79d359ba..a5edb908a5640f 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproachSpec.scala @@ -75,5 +75,4 @@ class NerCrfApproachSpec extends FlatSpec { assert(tags == Seq("PER", "PER", "LOC")) } - -} \ No newline at end of file +} From d2b8dc8aec63d1483cf29c26ce9e4d12ee417aa5 Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 11 Jan 2018 11:00:36 -0300 Subject: [PATCH 32/55] minor changes --- .../nlp/annotators/assertion/logreg/AssertionLogRegModel.scala | 3 +++ .../nlp/annotators/assertion/SentenceWindowingTest.scala | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 280594ed2d1637..be78d9d7de6409 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -37,6 +37,9 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS override val requiredAnnotatorTypes = Array(DOCUMENT) override final def transform(dataset: Dataset[_]): DataFrame = { + require(validate(dataset.schema), s"Missing annotators in pipeline. 
Make sure the following are present: " + + s"${requiredAnnotatorTypes.mkString(", ")}") + import dataset.sqlContext.implicits._ require(model.isDefined, "model must be set before tagging") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala index c12350f25e621a..5077064bb78664 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/SentenceWindowingTest.scala @@ -32,7 +32,7 @@ class SentenceWindowingTest extends FlatSpec with Matchers { assert(expected === result.tupleToList) } - "multi word targets" should "be correctly identified" in new Scope{ + "multi word targets" should "be correctly identified" in new Scope { val doc = "it has been said that the cat eats fish while listens to the rain" val target = "the cat" val expected = "it has been said that the cat eats fish while listens".split(" ") From 35c57e57703294d8078df725c1d8c31c77477900 Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 11 Jan 2018 13:30:36 -0300 Subject: [PATCH 33/55] implemented simple version of the regex tokenizer --- .../assertion/logreg/RegexTokenizer.scala | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala index 2a12294f47cc56..b49765e59d8ef8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala @@ -9,9 +9,21 @@ class RegexTokenizer extends Tokenizer{ val punctuation = Seq(".", ":", ";", ",", "?", "!", "+", "-", "_", "(", ")", "{", "}", "#", "mg/kg", "ml", "m2", "cm", "/", "\\", "\"", "'", "[", "]", "%", "<", ">", "&", "=") - val percent_regex = """([0-9]{1,2}\.[0-9]{1,2}%|[0-9]{1,3}%)""" - val number_regex = """([0-9]{1,6})""" + val percent_regex = """([0-9]{1,2}\.[0-9]{1,2}%|[0-9]{1,3}%)""".r + val number_regex = """([0-9]{1,6})""".r - override def tokenize(sent: String): Array[String] = ??? + override def tokenize(sent: String): Array[String] = { + // replace percentage + var tmp = percent_regex.replaceAllIn(sent, " percentnum ") + + // unbind special chars + for (c <- punctuation) { + tmp = tmp.replaceAllLiterally(c, " " + c + " ") + } + + // replace any num + val result = number_regex.replaceAllIn(tmp, " digitnum ").toLowerCase.split(" ").filter(_!="") + result + } } From abc4d431c362cd4e8fc79b3cf0173840cfebebb8 Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 18 Jan 2018 10:47:08 -0300 Subject: [PATCH 34/55] added html documentation for assertion status --- docs/components.html | 103 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 98 insertions(+), 5 deletions(-) diff --git a/docs/components.html b/docs/components.html index 4f2584f371b5e8..b15927666e66d7 100644 --- a/docs/components.html +++ b/docs/components.html @@ -977,7 +977,8 @@

[docs/components.html hunk: the surrounding HTML markup was lost in extraction. The recoverable text shows the component table of contents gaining an "AssertionStatus: Assertion Status Classifier" entry after "15. Finisher: Getting data out", with the following "TokenAssembler: Getting data reshaped" entry renumbered from 16 to 17; the "14. ViveknSentimentDetector" entry is unchanged context. A minimal usage sketch of the newly documented annotator follows below.]

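For orientation, a minimal Scala sketch of how the AssertionStatus annotator documented above is wired into a pipeline. It is assembled from the setters exercised in I2b2DatasetPipelineTest and the example notebook added later in this series; the embeddings path, the input column names, and the commented fit() call are illustrative assumptions rather than code taken verbatim from any patch, and setEmbeddingsSource mirrors the call shown in the example notebook.

import com.johnsnowlabs.nlp.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat
import org.apache.spark.ml.Pipeline

// Turns the raw "text" column into DOCUMENT annotations.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Logistic-regression assertion classifier over a token window around the
// target; the window sizes match the values used in the tests (11 tokens
// before, 13 after). The embeddings path is a placeholder for a local
// word-vectors file, not a path from the patches.
val assertionStatus = new AssertionLogRegApproach()
  .setLabelCol("label")
  .setInputCols("document")
  .setOutputCol("assertion")
  .setBefore(11)
  .setAfter(13)
  .setEmbeddingsSource("/path/to/PubMed-shuffle-win-2.bin", 200, WordEmbeddingsFormat.Binary)

val pipeline = new Pipeline().setStages(Array(documentAssembler, assertionStatus))

// Training data is expected to carry text, target, start, end and label
// columns, as produced by I2b2DatasetReader.readDataFrame:
// val model = pipeline.fit(trainingData)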
From 705ae69cc148bf38a13eedf096db829f57a056e1 Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 19 Jan 2018 16:13:16 -0300 Subject: [PATCH 35/55] work in progress for notepad --- .../example/logreg-assertion/assertion.ipynb | 266 ++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 python/example/logreg-assertion/assertion.ipynb diff --git a/python/example/logreg-assertion/assertion.ipynb b/python/example/logreg-assertion/assertion.ipynb new file mode 100644 index 00000000000000..b78bf9bccd3461 --- /dev/null +++ b/python/example/logreg-assertion/assertion.ipynb @@ -0,0 +1,266 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../../')\n", + "\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.ml import Pipeline\n", + "\n", + "from sparknlp.annotator import *\n", + "from sparknlp.common import *\n", + "from sparknlp.base import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Paths to i2b2 dataset\n", + "i2b2Dir = '/home/jose/Downloads/i2b2/concept_assertion_relation_training_data'\n", + "trainPaths = [i2b2Dir + '/concept_assertion_relation_training_data/partners',\n", + " i2b2Dir + '/concept_assertion_relation_training_data/beth']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "spark = SparkSession.builder \\\n", + " .appName(\"assertion-status\")\\\n", + " .master(\"local[2]\")\\\n", + " .config(\"spark.driver.memory\",\"4G\")\\\n", + " .config(\"spark.driver.maxResultSize\", \"2G\")\\\n", + " .config(\"spark.jar\", \"lib/sparknlp.jar\")\\\n", + " .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. i2b2 dataset needs registration prior to download.\n", + "2. training data is spread accross two directories, 'beth' and 'partners'." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'tokenizer' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mdocumentAssembler\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0msentenceDetector\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mposTagger\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mnerTagger\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'tokenizer' is not defined" + ] + } + ], + "source": [ + "import time\n", + "\n", + "\n", + "embeddingsFile = \n", + "\n", + "documentAssembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "assertion = AssertionStatusApproach()\\\n", + " .setLabelCol(\"label\") \n", + " .setInputCols(\"document\") \n", + " .setOutputCol(\"assertion\") \n", + " .setBefore(11) # set window parameters\n", + " .setAfter(13)\n", + " .setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary)\n", + "\n", + "finisher = Finisher() \\\n", + " .setInputCols([\"assertion\"]) \\\n", + " .setIncludeKeys(True)\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " documentAssembler,\n", + " sentenceDetector,\n", + " posTagger,\n", + " finisher\n", + " ])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+---------+--------------------+\n", + "|itemid|sentiment| text|\n", + "+------+---------+--------------------+\n", + "| 1| 0| ...|\n", + "| 2| 0| ...|\n", + "| 3| 1| omg...|\n", + "| 4| 0| .. Omga...|\n", + "| 5| 0| i think ...|\n", + "| 6| 0| or i jus...|\n", + "| 7| 1| Juuuuuuuuu...|\n", + "| 8| 0| Sunny Agai...|\n", + "| 9| 1| handed in m...|\n", + "| 10| 1| hmmmm.... i...|\n", + "| 11| 0| I must thin...|\n", + "| 12| 1| thanks to a...|\n", + "| 13| 0| this weeken...|\n", + "| 14| 0| jb isnt show...|\n", + "| 15| 0| ok thats it ...|\n", + "| 16| 0| <-------- ...|\n", + "| 17| 0| awhhe man.......|\n", + "| 18| 1| Feeling stran...|\n", + "| 19| 0| HUGE roll of ...|\n", + "| 20| 0| I just cut my...|\n", + "+------+---------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "#Load the input data to be annotated\n", + "data = spark. \\\n", + " read. \\\n", + " parquet(\"../../../src/test/resources/sentiment.parquet\"). 
\\\n", + " limit(1000)\n", + "data.cache()\n", + "data.count()\n", + "data.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start fitting\n", + "Fitting is ended\n" + ] + } + ], + "source": [ + "print(\"Start fitting\")\n", + "model = pipeline.fit(data)\n", + "print(\"Fitting is ended\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+---------+--------------------+--------------------+\n", + "|itemid|sentiment| text| finished_ner|\n", + "+------+---------+--------------------+--------------------+\n", + "| 1| 0| ...|word->is#result->...|\n", + "| 2| 0| ...|word->I#result->O...|\n", + "| 3| 1| omg...|word->omg#result-...|\n", + "| 4| 0| .. Omga...|word->..#result->...|\n", + "| 5| 0| i think ...|word->i#result->O...|\n", + "| 6| 0| or i jus...|word->or#result->...|\n", + "| 7| 1| Juuuuuuuuu...|word->Juuuuuuuuuu...|\n", + "| 8| 0| Sunny Agai...|word->Sunny#resul...|\n", + "| 9| 1| handed in m...|word->handed#resu...|\n", + "| 10| 1| hmmmm.... i...|word->i#result->O...|\n", + "| 11| 0| I must thin...|word->I#result->O...|\n", + "| 12| 1| thanks to a...|word->thanks#resu...|\n", + "| 13| 0| this weeken...|word->this#result...|\n", + "| 14| 0| jb isnt show...|word->jb#result->...|\n", + "| 15| 0| ok thats it ...|word->ok#result->...|\n", + "| 16| 0| <-------- ...|word-><-------...|\n", + "| 17| 0| awhhe man.......|word->awhhe#resul...|\n", + "| 18| 1| Feeling stran...|word->Feeling#res...|\n", + "| 19| 0| HUGE roll of ...|word->HUGE#result...|\n", + "| 20| 0| I just cut my...|word->I#result->O...|\n", + "+------+---------+--------------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "ner_data = model.transform(data)\n", + "ner_data.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.write().overwrite().save(\"./ner_pipeline\")\n", + "model.write().overwrite().save(\"./ner_model\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "from pyspark.ml import PipelineModel, Pipeline\n", + "\n", + "Pipeline.read().load(\"./ner_pipeline\")\n", + "sameModel = PipelineModel.read().load(\"./ner_model\")" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From fdc59b83a9c0296efa9bfc52b77054f423e5d730 Mon Sep 17 00:00:00 2001 From: Alberto Date: Mon, 22 Jan 2018 14:49:12 -0300 Subject: [PATCH 36/55] refactor to include Dataset interface in models --- .../com/johnsnowlabs/nlp/AnnotatorModel.scala | 73 ++++++------------- .../johnsnowlabs/nlp/BaseAnnotatorModel.scala | 42 +++++++++++ .../nlp/DatasetAnnotatorModel.scala | 11 +++ .../logreg/AssertionLogRegApproach.scala | 3 +- .../logreg/AssertionLogRegModel.scala | 6 +- .../annotators/ner/crf/NerCrfApproach.scala | 4 +- .../nlp/annotators/ner/crf/NerCrfModel.scala | 3 +- 
.../AnnotatorWithWordEmbeddings.scala | 8 +- .../embeddings/ModelWithWordEmbeddings.scala | 8 +- 9 files changed, 94 insertions(+), 64 deletions(-) create mode 100644 src/main/scala/com/johnsnowlabs/nlp/BaseAnnotatorModel.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/DatasetAnnotatorModel.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala index 1ab889583c4635..1c8f8ed24e265e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala @@ -1,67 +1,25 @@ package com.johnsnowlabs.nlp import org.apache.spark.ml.Model -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.util.DefaultParamsWritable +import org.apache.spark.sql._ import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.{DataFrame, Dataset, Row} -import org.apache.spark.sql.types._ -import org.apache.spark.sql.functions.{array, udf} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.MetadataBuilder /** - * This trait implements logic that applies nlp using Spark ML Pipeline transformers - * Should strongly change once UsedDefinedTypes are allowed - * https://issues.apache.org/jira/browse/SPARK-7768 + * Created by jose on 21/01/18. */ -abstract class AnnotatorModel[M <: Model[M]] - extends Model[M] - with DefaultParamsWritable - with HasAnnotatorType - with HasInputAnnotationCols - with HasOutputAnnotationCol { - - /** - * internal types to show Rows as a relevant StructType - * Should be deleted once Spark releases UserDefinedTypes to @developerAPI - */ - private type AnnotationContent = Seq[Row] - - /** - * takes a document and annotations and produces new annotations of this annotator's annotation type - * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any - * @return any number of annotations processed for every input annotation. Not necessary one to one relationship - */ - protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] - - /** - * Wraps annotate to happen inside SparkSQL user defined functions in order to act with [[org.apache.spark.sql.Column]] - * @return udf function to be applied to [[inputCols]] using this annotator's annotate function as part of ML transformation - */ - private def dfAnnotate: UserDefinedFunction = udf { - annotatorProperties: Seq[AnnotationContent] => - annotate(annotatorProperties.flatMap(_.map(Annotation(_)))) - } - - /** Shape of annotations at output */ - private def outputDataType: DataType = ArrayType(Annotation.dataType) - - /** requirement for pipeline transformation validation. 
It is called on fit() */ - override final def transformSchema(schema: StructType): StructType = { - val metadataBuilder: MetadataBuilder = new MetadataBuilder() - metadataBuilder.putString("annotatorType", annotatorType) - val outputFields = schema.fields :+ - StructField(getOutputCol, outputDataType, nullable = false, metadataBuilder.build) - StructType(outputFields) - } +abstract class AnnotatorModel[M <: Model[M]] extends BaseAnnotatorModel[M] { /** * Given requirements are met, this applies ML transformation within a Pipeline or stand-alone * Output annotation will be generated as a new column, previous annotations are still available separately * metadata is built at schema level to record annotations structural information outside its content + * * @param dataset [[Dataset[Row]]] * @return */ - override def transform(dataset: Dataset[_]): DataFrame = { + override final def transform(dataset: Dataset[_]): DataFrame = { require(validate(dataset.schema), s"Missing annotators in pipeline. Make sure the following are present: " + s"${requiredAnnotatorTypes.mkString(", ")}") val metadataBuilder: MetadataBuilder = new MetadataBuilder() @@ -74,8 +32,19 @@ abstract class AnnotatorModel[M <: Model[M]] ) } + /** + * Wraps annotate to happen inside SparkSQL user defined functions in order to act with [[org.apache.spark.sql.Column]] + * @return udf function to be applied to [[inputCols]] using this annotator's annotate function as part of ML transformation + */ + private def dfAnnotate: UserDefinedFunction = udf { + annotatorProperties: Seq[AnnotationContent] => + annotate(annotatorProperties.flatMap(_.map(Annotation(_)))) + } - /** requirement for annotators copies */ - override def copy(extra: ParamMap): M = defaultCopy(extra) + /** + * internal types to show Rows as a relevant StructType + * Should be deleted once Spark releases UserDefinedTypes to @developerAPI + */ + private type AnnotationContent = Seq[Row] -} \ No newline at end of file +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/BaseAnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/BaseAnnotatorModel.scala new file mode 100644 index 00000000000000..d76efe69c1e404 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/BaseAnnotatorModel.scala @@ -0,0 +1,42 @@ +package com.johnsnowlabs.nlp + +import org.apache.spark.ml.Model +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.DefaultParamsWritable +import org.apache.spark.sql.types._ + +/** + * This trait implements logic that applies nlp using Spark ML Pipeline transformers + * Should strongly change once UsedDefinedTypes are allowed + * https://issues.apache.org/jira/browse/SPARK-7768 + */ +abstract class BaseAnnotatorModel[M <: Model[M]] + extends Model[M] + with DefaultParamsWritable + with HasAnnotatorType + with HasInputAnnotationCols + with HasOutputAnnotationCol { + + /** + * takes a document and annotations and produces new annotations of this annotator's annotation type + * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return any number of annotations processed for every input annotation. Not necessary one to one relationship + */ + protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] + + /** Shape of annotations at output */ + private def outputDataType: DataType = ArrayType(Annotation.dataType) + + /** requirement for pipeline transformation validation. 
It is called on fit() */ + override final def transformSchema(schema: StructType): StructType = { + val metadataBuilder: MetadataBuilder = new MetadataBuilder() + metadataBuilder.putString("annotatorType", annotatorType) + val outputFields = schema.fields :+ + StructField(getOutputCol, outputDataType, nullable = false, metadataBuilder.build) + StructType(outputFields) + } + + /** requirement for annotators copies */ + override def copy(extra: ParamMap): M = defaultCopy(extra) + +} \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/DatasetAnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/DatasetAnnotatorModel.scala new file mode 100644 index 00000000000000..04f3e86a184dce --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/DatasetAnnotatorModel.scala @@ -0,0 +1,11 @@ +package com.johnsnowlabs.nlp + +import org.apache.spark.ml.Model + +/** + * Created by jose on 21/01/18. + * This class allows for model evaluation happening on distributed Spark collections + */ +trait DatasetAnnotatorModel[M <: Model[M]] extends BaseAnnotatorModel[M] { + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 1cd30a1c4ea6f7..4dcdc263d8726a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -1,5 +1,6 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg +import com.johnsnowlabs.nlp.AnnotatorApproach import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.LogisticRegression @@ -11,7 +12,7 @@ import org.apache.spark.sql.functions._ /** * Created by jose on 22/11/17. 
*/ -class AssertionLogRegApproach(override val uid: String) extends +class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproach[AssertionLogRegModel] with AnnotatorWithWordEmbeddings[AssertionLogRegApproach, AssertionLogRegModel] with Windowing { override val requiredAnnotatorTypes = Array(DOCUMENT) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index be78d9d7de6409..70df2333b6b705 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -1,7 +1,7 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT} -import com.johnsnowlabs.nlp.Annotation +import com.johnsnowlabs.nlp.{Annotation, DatasetAnnotatorModel} import com.johnsnowlabs.nlp.embeddings.{ModelWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} @@ -18,8 +18,10 @@ import scala.collection.immutable.Map */ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) - extends ModelWithWordEmbeddings[AssertionLogRegModel] with Windowing { + extends DatasetAnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings[AssertionLogRegModel] + with Windowing { + /* remove these Params */ val beforeParam = new Param[Int](this, "before", "Length of the context before the target") val afterParam = new Param[Int](this, "after", "Length of the context after the target") override lazy val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala index 8a86b9a6e99f38..fc5707b9de47da 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala @@ -18,8 +18,8 @@ import org.apache.spark.sql.{DataFrame, Dataset} /* Algorithm for training Named Entity Recognition Model. 
*/ -class NerCrfApproach(override val uid: String) - extends AnnotatorWithWordEmbeddings[NerCrfApproach, NerCrfModel] { +class NerCrfApproach(override val uid: String) extends AnnotatorApproach[NerCrfModel] + with AnnotatorWithWordEmbeddings[NerCrfApproach, NerCrfModel] { def this() = this(Identifiable.randomUID("NER")) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala index 8a9c179326f385..b3154796c7c659 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala @@ -15,7 +15,8 @@ import org.apache.spark.sql.{Encoders, Row} /* Named Entity Recognition model */ -class NerCrfModel(override val uid: String) extends ModelWithWordEmbeddings[NerCrfModel]{ +class NerCrfModel(override val uid: String) extends AnnotatorModel[NerCrfModel] +with ModelWithWordEmbeddings[NerCrfModel]{ def this() = this(Identifiable.randomUID("NER")) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala index 4b0735b054f8e2..a61c169cc73c70 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala @@ -4,9 +4,10 @@ import java.io.File import java.nio.file.Files import java.util.UUID -import com.johnsnowlabs.nlp.AnnotatorApproach +import com.johnsnowlabs.nlp.{AnnotatorApproach, BaseAnnotatorModel} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext +import org.apache.spark.ml.Model import org.apache.spark.ml.param.{IntParam, Param} import org.apache.spark.sql.SparkSession @@ -20,8 +21,9 @@ import org.apache.spark.sql.SparkSession * 3. Than this index file is spread across the cluster. * 4. Every model 'ModelWithWordEmbeddings' uses local RocksDB as Word Embeddings lookup. 
*/ -abstract class AnnotatorWithWordEmbeddings[A <: AnnotatorWithWordEmbeddings[A, M], M <: ModelWithWordEmbeddings[M]] - extends AnnotatorApproach[M] with AutoCloseable { +trait AnnotatorWithWordEmbeddings[A <: AnnotatorWithWordEmbeddings[A, M], M <: BaseAnnotatorModel[M] + with ModelWithWordEmbeddings[M]] extends AutoCloseable { + this:AnnotatorApproach[M] => val sourceEmbeddingsPath = new Param[String](this, "sourceEmbeddingsPath", "Word embeddings file") val embeddingsFormat = new IntParam(this, "embeddingsFormat", "Word vectors file format") diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ModelWithWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ModelWithWordEmbeddings.scala index fd8e72e5094b0b..027ef945456a2d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ModelWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ModelWithWordEmbeddings.scala @@ -3,9 +3,10 @@ package com.johnsnowlabs.nlp.embeddings import java.io.File import java.nio.file.{Files, Paths} -import com.johnsnowlabs.nlp.AnnotatorModel +import com.johnsnowlabs.nlp.BaseAnnotatorModel import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.ivy.util.FileUtil +import org.apache.spark.ml.Model import org.apache.spark.{SparkContext, SparkFiles} import org.apache.spark.ml.param.{IntParam, Param} @@ -16,8 +17,9 @@ import org.apache.spark.ml.param.{IntParam, Param} * * Corresponding Approach have to implement AnnotatorWithWordEmbeddings */ -abstract class ModelWithWordEmbeddings[M <: ModelWithWordEmbeddings[M]] - extends AnnotatorModel[M] with AutoCloseable { +trait ModelWithWordEmbeddings[M <: ModelWithWordEmbeddings[M] with Model[M]] + extends AutoCloseable { + this:BaseAnnotatorModel[M] => val nDims = new IntParam(this, "nDims", "Number of embedding dimensions") val indexPath = new Param[String](this, "indexPath", "File that stores Index") From e6dfcdb2ff221b1e4fa6e2d225657d78d0e3d53e Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 24 Jan 2018 11:14:20 -0300 Subject: [PATCH 37/55] added reader for negex dataset --- .../ml/logreg/NegexDatasetReader.scala | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala new file mode 100644 index 00000000000000..5fbe7bebf46645 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala @@ -0,0 +1,68 @@ +package com.johnsnowlabs.ml.logreg + +import java.io.File + +import com.johnsnowlabs.nlp.annotators.assertion.logreg.{SimpleTokenizer, Tokenizer, Windowing} +import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer} +import org.apache.spark.sql._ +import org.apache.spark.sql.functions.udf +import scala.io.Source + +/** + * Reader for this dataset, + * https://github.com/mongoose54/negex/blob/master/genConText/rsAnnotations-1-120-random.txt + */ + +class NegexDatasetReader(targetLengthLimit: Int = 10) extends Serializable { + + private def getTargetIndices(sentence: String, target: String) = { + + val targetTokens = target.split(" ").map(_.trim.toUpperCase).filter(_!="") + val firstTargetIdx = sentence.split(" ").map(_.trim). 
+ indexOfSlice(targetTokens) + val lastTargetIdx = firstTargetIdx + targetTokens.size - 1 + + if( lastTargetIdx < 0 || firstTargetIdx <0) + print(sentence) + (firstTargetIdx, lastTargetIdx) + } + + val specialChars = Seq(',', '.', ';', '.', ':', '/', '"') + + // these lines are ill formed + val blackList = Seq("2149", "1826", "987", "1321") + + def readDataframe(datasetPath: String)(implicit session:SparkSession):DataFrame = { + import session.implicits._ + val dataframe = Source.fromFile(datasetPath).getLines + .map{ line => + line.flatMap{ // separate special chars + case c if specialChars.contains(c)=> s" $c " + case c => Seq(c) + } + } + .filter{line => + // target must be smaller than right context + line.split("\t")(2).split(" ").filter(_!="").length < targetLengthLimit && + // line must contain the target + line.split("\t")(3).contains(line.split("\t")(2).toUpperCase) && + // skip broken lines + !blackList.exists(line.split("\t")(0).contains) + } + .map{ line => + val chunks = line.split("\t") + // keep single spaces + val doc = chunks(3).split(" ").map(_.trim).filter(_!="").mkString(" ") + val (s, e) = getTargetIndices(doc, chunks(2)) + Datapoint(doc.map(_.toLower), + chunks(2).toLowerCase.trim, + chunks(4).split(" ")(0).trim, // take Affirmed or Negated + s, e) + }.toSeq.toDF + + dataframe + } + +} + +case class Datapoint(sentence: String, target: String, label: String, start:Int, end:Int) From aa195e682464554d3cc2602b4d847c607cb11b48 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 24 Jan 2018 11:26:03 -0300 Subject: [PATCH 38/55] added jupyter notebook for assertion status --- .../example/logreg-assertion/assertion.ipynb | 200 +++++++++--------- python/sparknlp/__init__.py | 1 + python/sparknlp/annotator.py | 70 ++++++ 3 files changed, 167 insertions(+), 104 deletions(-) diff --git a/python/example/logreg-assertion/assertion.ipynb b/python/example/logreg-assertion/assertion.ipynb index b78bf9bccd3461..b57d3b4168bc37 100644 --- a/python/example/logreg-assertion/assertion.ipynb +++ b/python/example/logreg-assertion/assertion.ipynb @@ -14,7 +14,7 @@ "\n", "from sparknlp.annotator import *\n", "from sparknlp.common import *\n", - "from sparknlp.base import *" + "from sparknlp.base import *\n" ] }, { @@ -22,18 +22,6 @@ "execution_count": 2, "metadata": {}, "outputs": [], - "source": [ - "# Paths to i2b2 dataset\n", - "i2b2Dir = '/home/jose/Downloads/i2b2/concept_assertion_relation_training_data'\n", - "trainPaths = [i2b2Dir + '/concept_assertion_relation_training_data/partners',\n", - " i2b2Dir + '/concept_assertion_relation_training_data/beth']\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], "source": [ "spark = SparkSession.builder \\\n", " .appName(\"assertion-status\")\\\n", @@ -46,47 +34,36 @@ ] }, { - "cell_type": "markdown", + "cell_type": "raw", "metadata": {}, "source": [ - "1. i2b2 dataset needs registration prior to download.\n", - "2. training data is spread accross two directories, 'beth' and 'partners'." + "1. required imports.\n", + "2. create spark session." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'tokenizer' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mdocumentAssembler\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0msentenceDetector\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mposTagger\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mnerTagger\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'tokenizer' is not defined" - ] - } - ], + "outputs": [], "source": [ "import time\n", "\n", - "\n", - "embeddingsFile = \n", + "# TODO: fix this hard-coded path\n", + "embeddingsFile = '/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin'\n", "\n", "documentAssembler = DocumentAssembler()\\\n", - " .setInputCol(\"text\")\\\n", - " .setOutputCol(\"document\")\n", + " .setInputCol(\"sentence\")\\\n", + " .setOutputCol(\"document\")\\\n", + "\n", + "assertion = AssertionLogRegApproach()\\\n", + " .setLabelCol(\"label\")\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"assertion\")\\\n", + " .setBefore(11)\\\n", + " .setAfter(13)\\\n", + " .setEmbeddingsSource(embeddingsFile,200,3)\n", "\n", - "assertion = AssertionStatusApproach()\\\n", - " .setLabelCol(\"label\") \n", - " .setInputCols(\"document\") \n", - " .setOutputCol(\"assertion\") \n", - " .setBefore(11) # set window parameters\n", - " .setAfter(13)\n", - " .setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary)\n", "\n", "finisher = Finisher() \\\n", " .setInputCols([\"assertion\"]) \\\n", @@ -95,45 +72,44 @@ "pipeline = Pipeline(\n", " stages = [\n", " documentAssembler,\n", - " sentenceDetector,\n", - " posTagger,\n", + " assertion,\n", " finisher\n", " ])\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+------+---------+--------------------+\n", - "|itemid|sentiment| text|\n", - "+------+---------+--------------------+\n", - "| 1| 0| ...|\n", - "| 2| 0| ...|\n", - "| 3| 1| omg...|\n", - "| 4| 0| .. Omga...|\n", - "| 5| 0| i think ...|\n", - "| 6| 0| or i jus...|\n", - "| 7| 1| Juuuuuuuuu...|\n", - "| 8| 0| Sunny Agai...|\n", - "| 9| 1| handed in m...|\n", - "| 10| 1| hmmmm.... 
i...|\n", - "| 11| 0| I must thin...|\n", - "| 12| 1| thanks to a...|\n", - "| 13| 0| this weeken...|\n", - "| 14| 0| jb isnt show...|\n", - "| 15| 0| ok thats it ...|\n", - "| 16| 0| <-------- ...|\n", - "| 17| 0| awhhe man.......|\n", - "| 18| 1| Feeling stran...|\n", - "| 19| 0| HUGE roll of ...|\n", - "| 20| 0| I just cut my...|\n", - "+------+---------+--------------------+\n", + "+--------------------+--------------------+--------+-----+---+\n", + "| sentence| target| label|start|end|\n", + "+--------------------+--------------------+--------+-----+---+\n", + "|**initials ______...|multinodular goit...|Affirmed| 21| 25|\n", + "|02) mild aortic r...|mild aortic regur...|Affirmed| 1| 3|\n", + "|02) mild left atr...|mild left atrial ...|Affirmed| 1| 4|\n", + "|02) mild left atr...|mild left atrial ...|Affirmed| 1| 4|\n", + "|02) mild to moder...|mild to moderate ...|Affirmed| 1| 5|\n", + "|02) mild to moder...|mild to moderate ...|Affirmed| 1| 5|\n", + "|02) no valvular a...|valvular abnormal...| Negated| 2| 3|\n", + "|02) nondilated ri...|nondilated right ...|Affirmed| 1| 9|\n", + "|02) normal left v...|normal left ventr...|Affirmed| 1| 4|\n", + "|02) normal left v...|normal left ventr...|Affirmed| 1| 6|\n", + "|02) paradoxical s...|post-operative se...|Affirmed| 6| 8|\n", + "|02) small left ve...|small left ventri...|Affirmed| 1| 8|\n", + "|03) mild mitral r...|mild mitral regur...|Affirmed| 1| 3|\n", + "|03) mitral annula...|mitral annular ca...|Affirmed| 1| 3|\n", + "|03) moderate left...|moderate left atr...|Affirmed| 1| 4|\n", + "|03) normal pulmon...|normal pulmonary ...|Affirmed| 1| 5|\n", + "|03) thickened aor...|thickened aortic ...|Affirmed| 1| 3|\n", + "|03) thickened aor...|thickened aortic ...|Affirmed| 1| 6|\n", + "|03) thickened aor...|thickened aortic ...|Affirmed| 1| 8|\n", + "|03) thickened mit...|thickened mitral ...|Affirmed| 1| 6|\n", + "+--------------------+--------------------+--------+-----+---+\n", "only showing top 20 rows\n", "\n" ] @@ -143,8 +119,8 @@ "#Load the input data to be annotated\n", "data = spark. \\\n", " read. \\\n", - " parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n", - " limit(1000)\n", + " parquet(\"../../../src/test/resources/negex.parquet\"). \\\n", + " limit(3000)\n", "data.cache()\n", "data.count()\n", "data.show()" @@ -152,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": { "scrolled": false }, @@ -174,55 +150,71 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+------+---------+--------------------+--------------------+\n", - "|itemid|sentiment| text| finished_ner|\n", - "+------+---------+--------------------+--------------------+\n", - "| 1| 0| ...|word->is#result->...|\n", - "| 2| 0| ...|word->I#result->O...|\n", - "| 3| 1| omg...|word->omg#result-...|\n", - "| 4| 0| .. Omga...|word->..#result->...|\n", - "| 5| 0| i think ...|word->i#result->O...|\n", - "| 6| 0| or i jus...|word->or#result->...|\n", - "| 7| 1| Juuuuuuuuu...|word->Juuuuuuuuuu...|\n", - "| 8| 0| Sunny Agai...|word->Sunny#resul...|\n", - "| 9| 1| handed in m...|word->handed#resu...|\n", - "| 10| 1| hmmmm.... 
i...|word->i#result->O...|\n", - "| 11| 0| I must thin...|word->I#result->O...|\n", - "| 12| 1| thanks to a...|word->thanks#resu...|\n", - "| 13| 0| this weeken...|word->this#result...|\n", - "| 14| 0| jb isnt show...|word->jb#result->...|\n", - "| 15| 0| ok thats it ...|word->ok#result->...|\n", - "| 16| 0| <-------- ...|word-><-------...|\n", - "| 17| 0| awhhe man.......|word->awhhe#resul...|\n", - "| 18| 1| Feeling stran...|word->Feeling#res...|\n", - "| 19| 0| HUGE roll of ...|word->HUGE#result...|\n", - "| 20| 0| I just cut my...|word->I#result->O...|\n", - "+------+---------+--------------------+--------------------+\n", + "+--------------------+--------------------+------------------+\n", + "| sentence| target|finished_assertion|\n", + "+--------------------+--------------------+------------------+\n", + "|**initials ______...|multinodular goit...| result->Affirmed|\n", + "|02) mild aortic r...|mild aortic regur...| result->Affirmed|\n", + "|02) mild left atr...|mild left atrial ...| result->Affirmed|\n", + "|02) mild left atr...|mild left atrial ...| result->Affirmed|\n", + "|02) mild to moder...|mild to moderate ...| result->Affirmed|\n", + "|02) mild to moder...|mild to moderate ...| result->Affirmed|\n", + "|02) no valvular a...|valvular abnormal...| result->Negated|\n", + "|02) nondilated ri...|nondilated right ...| result->Affirmed|\n", + "|02) normal left v...|normal left ventr...| result->Affirmed|\n", + "|02) normal left v...|normal left ventr...| result->Affirmed|\n", + "|02) paradoxical s...|post-operative se...| result->Affirmed|\n", + "|02) small left ve...|small left ventri...| result->Affirmed|\n", + "|03) mild mitral r...|mild mitral regur...| result->Affirmed|\n", + "|03) mitral annula...|mitral annular ca...| result->Affirmed|\n", + "|03) moderate left...|moderate left atr...| result->Affirmed|\n", + "|03) normal pulmon...|normal pulmonary ...| result->Affirmed|\n", + "|03) thickened aor...|thickened aortic ...| result->Affirmed|\n", + "|03) thickened aor...|thickened aortic ...| result->Affirmed|\n", + "|03) thickened aor...|thickened aortic ...| result->Affirmed|\n", + "|03) thickened mit...|thickened mitral ...| result->Affirmed|\n", + "+--------------------+--------------------+------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ - "ner_data = model.transform(data)\n", - "ner_data.show()" + "result = model.transform(data)\n", + "result.select(\"sentence\", \"target\", \"finished_assertion\").show()" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "Py4JJavaError", + "evalue": "An error occurred while calling o286.save.\n: scala.NotImplementedError: The default jsonEncode only supports string and vector. 
org.apache.spark.ml.param.Param must override jsonEncode for java.lang.Integer.\n\tat org.apache.spark.ml.param.Param.jsonEncode(params.scala:98)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1$$anonfun$2.apply(ReadWrite.scala:296)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1$$anonfun$2.apply(ReadWrite.scala:295)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.AbstractTraversable.map(Traversable.scala:104)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1.apply(ReadWrite.scala:295)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1.apply(ReadWrite.scala:295)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$.getMetadataToSave(ReadWrite.scala:295)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:277)\n\tat org.apache.spark.ml.util.DefaultParamsWriter.saveImpl(ReadWrite.scala:250)\n\tat org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:114)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$saveImpl$1.apply(Pipeline.scala:254)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$saveImpl$1.apply(Pipeline.scala:253)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.saveImpl(Pipeline.scala:253)\n\tat org.apache.spark.ml.Pipeline$PipelineWriter.saveImpl(Pipeline.scala:205)\n\tat org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:114)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moverwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./ner_pipeline\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m 
\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moverwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./ner_model\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pyspark/ml/util.pyc\u001b[0m in \u001b[0;36msave\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbasestring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"path should be a basestring, got type %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 107\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jwrite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 108\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0moverwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/py4j/java_gateway.pyc\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1131\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1132\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1133\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1134\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pyspark/sql/utils.pyc\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/py4j/protocol.pyc\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 317\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 320\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m raise Py4JError(\n", + "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o286.save.\n: scala.NotImplementedError: The default jsonEncode only supports string and vector. org.apache.spark.ml.param.Param must override jsonEncode for java.lang.Integer.\n\tat org.apache.spark.ml.param.Param.jsonEncode(params.scala:98)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1$$anonfun$2.apply(ReadWrite.scala:296)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1$$anonfun$2.apply(ReadWrite.scala:295)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.AbstractTraversable.map(Traversable.scala:104)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1.apply(ReadWrite.scala:295)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1.apply(ReadWrite.scala:295)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$.getMetadataToSave(ReadWrite.scala:295)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:277)\n\tat org.apache.spark.ml.util.DefaultParamsWriter.saveImpl(ReadWrite.scala:250)\n\tat org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:114)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$saveImpl$1.apply(Pipeline.scala:254)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$saveImpl$1.apply(Pipeline.scala:253)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.saveImpl(Pipeline.scala:253)\n\tat org.apache.spark.ml.Pipeline$PipelineWriter.saveImpl(Pipeline.scala:205)\n\tat org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:114)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat 
py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n" + ] + } + ], "source": [ - "pipeline.write().overwrite().save(\"./ner_pipeline\")\n", - "model.write().overwrite().save(\"./ner_model\")" + "pipeline.write().overwrite().save(\"./assertion_pipeline\")\n", + "model.write().overwrite().save(\"./assertion_model\")" ] }, { @@ -236,8 +228,8 @@ "source": [ "from pyspark.ml import PipelineModel, Pipeline\n", "\n", - "Pipeline.read().load(\"./ner_pipeline\")\n", - "sameModel = PipelineModel.read().load(\"./ner_model\")" + "Pipeline.read().load(\"./assertion_pipeline\")\n", + "sameModel = PipelineModel.read().load(\"./assertion_model\")" ] } ], diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index ba1d64ec01055f..55c01748211431 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -6,6 +6,7 @@ sys.modules['com.johnsnowlabs.nlp.annotators.ner'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.ner.regex'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.ner.crf'] = annotator +sys.modules['com.johnsnowlabs.nlp.annotators.assertion.logreg'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.pos'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.pos.perceptron'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.sbd'] = annotator diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py index 97b75ffa07b1aa..55a50fa10bc4a8 100755 --- a/python/sparknlp/annotator.py +++ b/python/sparknlp/annotator.py @@ -17,6 +17,7 @@ perceptron = sys.modules[__name__] ner = sys.modules[__name__] crf = sys.modules[__name__] +assertion = sys.modules[__name__] regex = sys.modules[__name__] sbd = sys.modules[__name__] sda = sys.modules[__name__] @@ -537,3 +538,72 @@ def __init__(self): class NerCrfModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorProperties): name = "NerCrfModel" + +class AssertionLogRegApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, AnnotatorProperties, AnnotatorWithEmbeddings): + + label = Param(Params._dummy(), "label", "Column with one label per document", typeConverter=TypeConverters.toString) + # the document where we're extracting the assertion + document = Param(Params._dummy(), "document", "Column with the text to be analyzed", typeConverter=TypeConverters.toString) + target = Param(Params._dummy(), "target", "Column with the target to analyze", typeConverter=TypeConverters.toString) + maxIter = Param(Params._dummy(), "maxIter", "Max number of iterations for algorithm", TypeConverters.toInt) + regParam = Param(Params._dummy(), "regParam", "Regularization parameter", TypeConverters.toFloat) + eNetParam = Param(Params._dummy(), "eNetParam", "Elastic net parameter", TypeConverters.toFloat) + beforeParam = Param(Params._dummy(), "beforeParam", "Length of the context before the target", TypeConverters.toInt) + afterParam = Param(Params._dummy(), "afterParam", "Length of the context after the target", TypeConverters.toInt) + + + def setLabelCol(self, label): + self._set(label = label) + return self + + def setDocumentCol(self, doc): + self._set(document = doc) + return self + + def setTargetCol(self, t): + self._set(target = t) + return self + + def setMaxIter(self, maxiter): + self._set(maxIter = maxiter) + return self + + def setReg(self, lamda): + self._set(regParam = lamda) + return self + + def setEnet(self, enet): + self._set(eNetParam = enet) + return self + + def 
setBefore(self, before): + self._set(beforeParam = before) + return self + + def setAfter(self, after): + self._set(afterParam = after) + return self + + def _create_model(self, java_model): + return AssertionLogRegModel(java_model) + + @keyword_only + def __init__(self): + super(AssertionLogRegApproach, self).__init__() + self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach", self.uid) + self._setDefault(label = "label", + document = "document", + target = "target", + maxIter = 26, + regParam = 0.00192, + eNetParam = 0.9, + beforeParam = 10, + afterParam = 10) + + +class AssertionLogRegModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorProperties): + name = "AssertionLogRegModel" + + + + From 51cdf094969a1cc805239e36dafc0afa1e812a80 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 24 Jan 2018 11:27:50 -0300 Subject: [PATCH 39/55] some changes to make parameter names match in notebook --- .../logreg/AssertionLogRegApproach.scala | 39 ++++++++++++------- .../logreg/AssertionLogRegModel.scala | 12 +++++- .../assertion/logreg/Windowing.scala | 4 +- .../AnnotatorWithWordEmbeddings.scala | 1 - .../ml/logreg/I2b2DatasetPipelineTest.scala | 2 +- 5 files changed, 37 insertions(+), 21 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 4dcdc263d8726a..0dffafce0bb2db 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -5,10 +5,13 @@ import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.util.Identifiable -import org.apache.spark.ml.param.Param +import org.apache.spark.ml.param.{IntParam, Param} +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ +import scala.collection.mutable + /** * Created by jose on 22/11/17. 
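+  * Trains a logistic regression over fixed-size token windows around a target
+  * phrase to predict its assertion status (e.g. 'Affirmed', 'Negated').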
 */
@@ -26,29 +29,29 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac
   def this() = this(Identifiable.randomUID("ASSERTION"))

   // example of possible values, 'Negated', 'Affirmed', 'Historical'
-  val labelColumn = new Param[String](this, "label", "Column with one label per document")
+  val label = new Param[String](this, "label", "Column with one label per document")

   // the document where we're extracting the assertion
-  val documentColumn = new Param[String](this, "document", "Column with one label per document")
+  val document = new Param[String](this, "document", "Column with the text to be analyzed")

   // the target term, that must appear capitalized in the document, e.g., 'diabetes'
-  val targetColumn = new Param[String](this, "target", "Column with the target to analyze")
+  val target = new Param[String](this, "target", "Column with the target to analyze")

   val maxIter = new Param[Int](this, "maxIter", "Max number of iterations for algorithm")
   val regParam = new Param[Double](this, "regParam", "Regularization parameter")
   val eNetParam = new Param[Double](this, "eNetParam", "Elastic net parameter")
-  val beforeParam = new Param[Int](this, "before", "Length of the context before the target")
-  val afterParam = new Param[Int](this, "after", "Length of the context after the target")
+  val beforeParam = new IntParam(this, "beforeParam", "Length of the context before the target")
+  val afterParam = new IntParam(this, "afterParam", "Length of the context after the target")

-  def setLabelCol(label: String) = set(labelColumn, label)
-  def setDocumentCol(document: String) = set(documentColumn, document)
-  def setTargetCol(target: String) = set(targetColumn, target)
+  def setLabelCol(l: String) = set(label, l)
+  def setDocumentCol(d: String) = set(document, d)
+  def setTargetCol(t: String) = set(target, t)
   def setMaxIter(max: Int) = set(maxIter, max)
   def setReg(lambda: Double) = set(regParam, lambda)
   def setEnet(enet: Double) = set(eNetParam, enet)
-  def setBefore(before: Int) = set(beforeParam, before)
-  def setAfter(after: Int) = set(afterParam, after)
+  def setBefore(b: Int) = set(beforeParam, b)
+  def setAfter(a: Int) = set(afterParam, a)

-  setDefault(labelColumn -> "label",
-    documentColumn -> "document",
-    targetColumn -> "target",
+  setDefault(label -> "label",
+    document -> "document",
+    target -> "target",
     maxIter -> 26,
     regParam -> 0.00192,
     eNetParam -> 0.9,
@@ -56,11 +59,17 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac
     afterParam -> 10
   )

+  /* send this to common place */
+  def extractTextUdf = udf { document: mutable.WrappedArray[GenericRowWithSchema] =>
+    document.head.getString(3)
+  }
+
   override def train(dataset: Dataset[_]): AssertionLogRegModel = {
     import dataset.sqlContext.implicits._

     /* apply UDF to fix the length of each document */
     val processed = dataset.toDF.
+      withColumn("text", extractTextUdf($"document")).
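+      // applyWindowUdf turns the fixed-size token window around the target into one dense feature vector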
withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) val lr = new LogisticRegression() @@ -68,7 +77,7 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac .setRegParam(getOrDefault(regParam)) .setElasticNetParam(getOrDefault(eNetParam)) - val labelCol = getOrDefault(labelColumn) + val labelCol = getOrDefault(label) /* infer labels and assign a number to each */ val labelMappings: Map[String, Double] = dataset.select(labelCol).distinct.collect diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 70df2333b6b705..ff28830c552fdc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -8,9 +8,11 @@ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.hadoop.fs.Path import org.apache.spark.ml.param.Param +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.functions.udf import scala.collection.immutable.Map +import scala.collection.mutable /** @@ -22,8 +24,8 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS with Windowing { /* remove these Params */ - val beforeParam = new Param[Int](this, "before", "Length of the context before the target") - val afterParam = new Param[Int](this, "after", "Length of the context after the target") + val beforeParam = new Param[Int](this, "beforeParam", "Length of the context before the target") + val afterParam = new Param[Int](this, "afterParam", "Length of the context after the target") override lazy val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) setDefault( @@ -47,6 +49,7 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS /* apply UDF to fix the length of each document */ val processed = dataset.toDF. + withColumn("text", extractTextUdf($"document")). 
withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) model.get.transform(processed).withColumn(getOutputCol, packAnnotations($"text", $"target", $"start", $"end", $"prediction")) @@ -83,6 +86,11 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS } override def write: MLWriter = new AssertionLogRegModel.AssertionModelWriter(this, super.write) + + /* send this to common place */ + def extractTextUdf = udf { document:mutable.WrappedArray[GenericRowWithSchema] => + document.head.getString(3) + } } object AssertionLogRegModel extends DefaultParamsReadable[AssertionLogRegModel] { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 82f74cf1a48b7e..14bc00b285d208 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -56,7 +56,7 @@ trait Windowing extends Serializable { applyWindow(doc, start, end) } - + /* TODO targetTerm never used */ def applyWindow(wvectors: WordEmbeddings) (doc:String, targetTerm:String, s:Int, e:Int) : Array[Double] = { val tokens = doc.split(" ").filter(_!="") @@ -84,7 +84,7 @@ trait Windowing extends Serializable { } def applyWindowUdf = - //here 's' and 'e' are token number for start and end of target when split on " " + //here 's' and 'e' are token numbers for start and end of target when split on " " udf { (doc:String, targetTerm:String, s:Int, e:Int) => Vectors.dense(applyWindow(wordVectors.get)(doc, targetTerm, s, e)) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala index a61c169cc73c70..8a5ccd0bb7cff8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AnnotatorWithWordEmbeddings.scala @@ -7,7 +7,6 @@ import java.util.UUID import com.johnsnowlabs.nlp.{AnnotatorApproach, BaseAnnotatorModel} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext -import org.apache.spark.ml.Model import org.apache.spark.ml.param.{IntParam, Param} import org.apache.spark.sql.SparkSession diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala index 85e3203190b075..4fd629d1b87029 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2DatasetPipelineTest.scala @@ -2,7 +2,7 @@ package com.johnsnowlabs.ml.logreg import com.johnsnowlabs.ml.common.EvaluationMetrics import com.johnsnowlabs.nlp.{Annotation, DocumentAssembler} -import com.johnsnowlabs.nlp.annotators.assertion.logreg.{AssertionLogRegApproach, AssertionLogRegModel} +import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} import org.apache.spark.sql.{Row, SparkSession} From 6e640c4dfed458a9d21e43a692851dd4172d72b2 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 24 Jan 2018 11:29:38 -0300 Subject: [PATCH 40/55] added parquet version of negex dataset for notebook --- src/test/resources/negex.parquet/._SUCCESS.crc | Bin 0 -> 8 bytes 
 ...47-499a-8ddd-7a77eec21916.snappy.parquet.crc | Bin 0 -> 536 bytes
 src/test/resources/negex.parquet/_SUCCESS       |   0
 ...0-a047-499a-8ddd-7a77eec21916.snappy.parquet | Bin 0 -> 67165 bytes
 4 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 src/test/resources/negex.parquet/._SUCCESS.crc
 create mode 100644 src/test/resources/negex.parquet/.part-00000-ed6b9220-a047-499a-8ddd-7a77eec21916.snappy.parquet.crc
 create mode 100644 src/test/resources/negex.parquet/_SUCCESS
 create mode 100644 src/test/resources/negex.parquet/part-00000-ed6b9220-a047-499a-8ddd-7a77eec21916.snappy.parquet

diff --git a/src/test/resources/negex.parquet/._SUCCESS.crc b/src/test/resources/negex.parquet/._SUCCESS.crc
new file mode 100644
index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c
GIT binary patch
literal 8
PcmYc;N@ieSU}69O2$TUk

literal 0
HcmV?d00001

diff --git a/src/test/resources/negex.parquet/.part-00000-ed6b9220-a047-499a-8ddd-7a77eec21916.snappy.parquet.crc b/src/test/resources/negex.parquet/.part-00000-ed6b9220-a047-499a-8ddd-7a77eec21916.snappy.parquet.crc
new file mode 100644
index 0000000000000000000000000000000000000000..aa095d0b2fb4a5c04c0cc912cfbabab2dc99dbaf
GIT binary patch
literal 536
[base85 payload omitted; the binary diffs for _SUCCESS and for the 67165-byte part-00000-ed6b9220-a047-499a-8ddd-7a77eec21916.snappy.parquet file are likewise omitted]
z9#e%aAA5|8dTsMDb?n=>j%gA;vL4sAKbO%>mwd5bH(kn?Gh304-)^2w(!Sr@%D3~6 z=VtqMWejul-SqwE7{%7c(XfO7{7gA>%h{t&u8PM>EpnP4-GE`Rh@G52(G;`=%D@T@lmHwk9o(R7(%20hi%YSsuHRr0)wO`d-89ljq#n3TR z?lr6$Gxfm<{}-k`!prohzxa=JEBV!9XB&pBmgWW&U6tmAuUwtKF!A#0f+Y@q4c=__ zaOaA_LxwwzE7+u!qie4@S39>|D_UQ3=Nc9czjoZ#W!WQK+tw7Vb#2=)cf|OeI}QpH zcJKRm#Dq5v+*vze&tbg0>E+jz?9UpdzcgfI=>b;xb1r-ETG92=Lrrr7%MLYfTUYk( zy@S`w-v8zDy7Hr}|N4q!x(;7f9M|>0M+uk|RQYN6>t9x$OuVulx2w(Q>hn1rzN)@B zxcIA@tIky$YOj}m5md8rkAX9qi)X& zTRHCb{KzeAOeZ4sK_Pm0Z z)4toj6)%Ure!0_6SGO+d>cbT-%_KQt%d&*Juvc;-cD(jVulUBoSM$<2=kfu0InEVB z#>{uF98uO7zH0Oo@zSj+_nKFYog2Mvb-^pcchh(<lAa&8RX z=!o$kNO1e-g}&m+`aD;>;Z2zM(q0jty3x2cke%&iS6j~yk4aTE{|ClckpV_x;GAf zQFgNQof|X$c<|1xSy(-{U+xrtxOwG-hlju2wr|1% ztKE^GFHiX4$ZvPnz4^f(KV2_B`q%IGe>lpt(pG%vqv@RBRrrsp@U{+|mT-(~yZgsu z+793UcwFBj_R&Yip(XoH82wu%nhUV&;AP^+LiwqQpIBbhR(@(*(z~)TeBCtrXVJUf zx1WrC2MaX*s5a?T;z?{`NoRXkoldzvE$K|hyX&jYr2T-ebot}Y$7j3#jTeLR$E!)_ zGFk<{eXg6|=(o@J2;ESPE#r-!Kkt?FRrQ6uuFu+E?AIf>=3@U|U6L;i8an#GrJ)5I zYA)lE`sMP-s;>@Q8S|p9_G(Pt@NPCVJ=$(Pj^ zM!$nEOz&``?*1E3ZcKaf;GLWDMf>;V*^`?u{@wDTn~X)Cp4x=PJh@3<-^wsIBOr;|xHapOASQl0D3?%B^5hn7G4_0U{pQ>fy`48!s(qOM~ReDu4l%K+ys*B|O%3 z$|~+d^vdBabjE+eByY-#W1}~PXTji6raT(-FVrCXC}R~jC9YC_myMczOu+Z^{!2qZ zj{;J({}@i>;u0cCyoj6RL>r3XGUwnVaj(`12Ehn{;jU*D+DH3{0YiE-!S?v!K6ZGT zlk62toJF~SOCP?}-hshHMu;1qiDs_l zv8T9RC|gJ0WOt{c9D55=FWc!4Q_|=IiR6|Fn;bKvN*B21%?4L$aC0*3V#{Y zgc7Nu%E>-({=(Fz-BhR$NWVduHf;7k-dabLR<<&V!>w@D0$xy1P@;U#oox_sr=fQ& za1|t;>kXwqi{TXyBIB&8ECle8;d)2UR$(2p?^|mT$RoKck;2EA0mx{vpkM9|*P`t9 zhi3zw95imqeAg_AZ4pny4r7yl0t7?V0VypyYW6oyP}P*`13HHN<59#QIQC^w6J&e% za|Ni_%j!bRkE{jW@i|t1{D@-wp)!6XgMKyI;iY2!iF;r@xg^h7Bd`sC2Z(?7v0{>X zCnkj*k21M3^28i>jtb9^a#RK?i%2(G=mkJd!+Xs=L8G%2RkJ2RdOOI8aj!5wIg?xb zE^J?&6ZKf>SrqP)3_!rgs$x?Us_UeN3ph(Ikq43>o&TthC9QqLtI6 zrgbgG6QwTrB4g_K5g^u;bOZVoC2Cn>?^C7b`opPX3mg@#Yk`>{-%`&X+A26}8j z5OpMRjkUwqz2VeiLvnc(jtgsgJ(uZ=N6_kRR1uczIS!j&9P%#^hw!|?E?sDy7^c$V zc0Y|OEs(6J^yzTt>`QzY0XaQNxGBsO@v+egQ%+fdTkBXA-ap|#E_az;;o5NmaP z(V7_2j}d>NA9C!MXy6pG>!U;E*i{C0&!yST-5EX z@h>r^&)6??3zXNH%BD{YWuVu!XE-aH7%rH)ITnYFgFUjk9Cz25*ox)hK2nLSqQE6> zgYvR~t$SrRApKBQh9cRZU5eWMw;^*dLJK_}G1rDG#oMFoV9p#X=8( zbqjA4o*iD&Is+Wp!M^6_F4G$DYZ;~Zqjyfzn{Zm!RI_`bxvKDVjoU0fl?{(jn!>SG z=!n+iScGb%0v0e?KQ&D^f2Ef*(wZl>lOk%yIQB)!0E-q-LXoSv3l!p$fxIm)maJhiW#_s0 z*Ljasv+Kj>ITw<66(Are6s@wxMj;lda3LB(%arv;EqpX#=VCDxL4Az9$6IIwEC+L# zvL-+HhADb>jtP^7@=NsbfWBDUS$@$7zA6$y5+CbJAaxn%AYpRD@OkB_DNLrI@Eb_X z1j3#&=EoHP$D>OzV=2L%<`i!E$}_RyAgxOFF7i~VAWwCiZhGL{AJ(FIL{Y4)yr~fz zWKj_zq_Ll83|Xs*ZnO;drIS>OsuIT;G?hFDl%X-(i4l||9RH4)ViK5@t6^m)7u4n! zfSrfn93=kGp~(HqsFrw$c< zIq5=)2S9OGVNM=tr$&=%o8An$e^q&bt6on>bKu_KF_R`KKQ_n793^ar4<&{1Io8R& zPNsiYAxv;qHy)8jON+$w%0Q;lQIG$%h1n{%Nz3$4@~FasvxV#rQ$#7N^1%V54pbtkf17o0r*mK<(@Xu~Atc4gS_ zJOAUYwc-Op<%-ZVO93RtR-K}(SUip>7DsF$X!JR)JTqwwQK1~jM`+126`q0oDj`_& zV54LM(8wufkXgRsN`8W$6`RR(hjo*}j1vJUj4D=Rmc2ai%X+?Taq}3&6g)ntGs+@L zgwpk|Xn_lpJipVvT2|-zQvrKZlP(HNw*sgg9QrOYUvI(GYyU8)l>c!d5~DT@`U2U9 zi1VXCf;}HkLvIX+W-U7`7@bpESI$jlO58cnTh!+|pJXYulkkQj?sDOs;a+NS%1J}y zCOq%MEzYgK8uTv=NnF%<5`HS0(}-lj(n2UDj=na|;gm3Zy{61D3{aR!^+mE^3pg%5 zQLhC$yNup|r3P_3p3r(m4?jeeEBq=z5s+Tsz7tj(3+_@GVjnm#OY^yrZ@fx>Q^~l8 zcuZx_5NAm387CLgR0-cCS|`i;X~&mC|JUBoU{k^WmpoiywbCy&I(JYhZ0~qroYEXc zzAOH+Z;Sbg^yQj6ll?js&MR_>fJXsm9G}H8F7`Zn1-(p&)5QZjEo%(%PfTR6*7JDC zDUS>$;7W$vQh)+SL(71Y7Of3d~;d z_#qOq?akAQOIu-X$bbi}6&2xOhL&sa$-u zFWju;Z9g-l#E6Wl$WaY+`DN}WBw59jxRUg;389SEsiWWM^Rah)+-&JIASMIg@0afA zB0NiBWZYSm(nRbH%iFjBvv*sjE9VFmA|%Fn&K3BB_%3Y0MT-{6LXGR8=Zo;PG*v4IsAYS!)megohh0#! 
zyZI&dZ?h981uPWJx#DKfYEki@qej2RZGuhD zEaUa#3C1of?Icj8MZaM}mP3V2n_ke>KP3yPwiG5;l1vd415Qh!g zgkMyO_gd_E$9|y|g4wD9R_Sf@nm~IgaaH1GlU!e!V~^!pp}eCEB6h6`!2`K%z$_7L zRj{4aSA*BfY00}9HnQO>=;D!X?)Xq&Bx7wbtC;O<#$v@2S4lc>?9irJS{eC9?&!kc z)UqO*13<}ZsSm^!1dv#v^TcSpIc|Z`B|$LhOtC5Qb{a6?3zM(V=sm6pvsS?b21kE! zmAfN9O`Dg@xF@?8;|_+39UYWkA}}QlS?<>S=NeM#O9BOcw$`#i%hvK&RgAUqOyi$S zD{8_hW5*9Um5O2)KU6EWRsmBFU7E9=e5QIX>=(4S*jAUvy9x1=a84x;7%6n|1)O8Z zny~upOe;+pqF}2a=mW3esft|Ig1DX4gfY}s`$%f}$OtVL7o8)ja5TQX9hQe4`o9dlroxk{Z}AE`0x)5d&ZdxIvGOd-cR8RYr0T2y3{9gm)l980&;%L1(cpeRX1E3r zG;4TY#|8-f$ab0YKG6u7fh0|#vklG<80QHa>F*D}x#t4bW|hBu9Prti`8IK!{N&cs zUxVVq826`m!MRduir})-ZA<_pQ~`2!AkI(~)JfFjFWsY|M)n3fT=>zyVJDCUq${r! z!$ZkNKRQtBIG+B!K^Y12Xh}_eKGrN7&OcEhEAX~uN{HQvp;IL^QhV=_vb_pNUwBrQ zBW(yn)^hQy8GL9wZ~Or>=se_2Gvnvj4#ytO0lMGUaNTiK>ntyIH3i6{?kY=YZ+}pJ zM2~Dizhj^#oufK;LK~^8J-t<4O&cbH-qzDQs_aC@E~-)nsD@l`h0V&7FD&+V%n*CS z@|chKtZBO$`7c7YYVz0@Jky9TO*=>nl0d4k5P|OUr5h~dCzHIqdYGE-3eoDP55r#} z)!e-=$!3dqbcYJF-J@iZAnvt1)6O}EDd8{!m*iUD`AoM>_xC8tMk9s_J2Nvaq-mae znGuRrbl7t0xE_R6!NB*(rRxQixzqD>-!|u5cvMcxp|a!D;U0i-t+StE?c-U-PWuS# zMI1p@%B69W0YT^+faYi1c zCk8o3lKEPFO$l;A@_7r1r!Fojy`2xc*tN+LCWL=w&txCi<6^M!bi{~za$7`#hteO8 zy@LApp<*rpV49tcQLXn4pcWw&cO+xkCk}Hb(=~o_B}ptnuc2-Ot&W`abzE2BA>-D_`tW`#+%H&;s~oxJ=adYq5N~F5B-==b1PDeIw>Bap5Qs8J z@rb{?&=Bph(;X|sX=&y*SSUModepWn0{iZR)_f0tM}#esH$Lx46AjmK5n0G9FEW)` zK~Oykt5uE&q>d#lCVc9t#3`!6oZryxVcIGbM8i)TqWQHP7bQ0RW5S?O08pYtVCBE& z*y$K<<0|t~5a3dk$P~#wNOb<3GtX#)_H|*Z1jPp zp*TxGdYOSUp+Y6EYKIV1foNA|l!d=qdj@)Md}u~7r;ibSu`p| zgq5J|<5$~+R0~cXt1XbjMHahbAHXMal6PR7Hc9glOUO903Y^~NXp2X$98PtFl&Bz@!$@clJ13P{-4;(D zJ+{YE1tl6qMWIyEs6P>T?={EGOM}v14{)0S5rw>qIP`RN*qW`5j_?IbMlwaMMZQ{w zha4kwQUtb2wo{$JfNvUoo+&=?mGqrl=`B_s5?4uF!`u_+C>~KMzlszGu#0e{)WXB5 zLQ$36)+leK&r~iO9{zD%*W4=z1Q}3dfS*_jZw)**ulOs*VFF5cgrs5jLKW&sBy0Q;%1Xs3cJr0~tcSF+ z@WVs+3>Ck?0GCeV4LScTq%*ZSC$SYe3~3vpRG=GrdPiZ`l`SZx^2|IM%r6YlW3Fdq z4RM%2tO2c!R+$jcSaS8EK0M0i)r;4Z6v0|ql?Sz}Qg8`NRqmXeZgg>=4zxkbdRR#4 z#c+OyUf%-IrIU4J1IJyKlb20~h&DV^VE-eA7}f3slHdz=mR_z*I_bm!;lo%NmGY)Y z%7`9!uEc2>7|ZsM^+EI1(wVFYxRWx|pxTvB%7P+%4CIl9UgU-CArO^Eq+7_jD4~nR zT2xpJ4=1X-=>l zU&=2LdYLh^wSqhdmi^k}odxc%NlA?IN9AGVP|VCAy*X(Y4GiL}P#_mK_MvYDup{-j zyww1C8JM!%j#|3M1Q+;Kv9A)tE&S<8(x-+@Q+a=Xa5I=k09?~wk4XpR75oRrtR)cV zJJfMstEx(C=w6lZyHQ#eULbw$9!*je2M@$bC$K~YWqBzeX z94WB2T-R{^Fy?>r$0D&a3XEC^Fu-PC;6kz@k&!ATP-vDkicmZ~j@J9#%Ys+*+BX}z zu#Z{LF>j3ma82HuRP5|2;`F+z;&WNS%lI((=% z-I8yz^w7A|L>)#KB?ZwQ1{G3&G8YS<7KEU{Hy1*(?7;MCm>fNw{8Y zumxM$Yj7xhAkx1gq`gRu_}m!jHeVj2#T|AMufWDhYh_e~R-@dlMK)EpV+HY1d7R?X ztck+o5K?VorZjgWR&CSk{HqB}VR|%Ui8JUTapu8~axf}>0FxHY7$9)f#78p0bt)N- zGdnnVdU_mV8Whkrz)jYK#E)Qd;}}D|BQUQm14Jk2DjtQ>620VF*_u!J-^hx0Oxx4# zOy1_!O`TU=Kp=I-NmUj_PWani6X-!o3jLH_#QWF_iUUzQ$@`8G#st79 zBlS}zB5O)w8oSafiENe97C(>)wkG?=TN#?Cq7xFxT#nr2q_IMgm53UUe6#IL%?U>* zM?d-WR>r%S;Q6LV<*58~O!DIR9!yI?CS@-Z{XOaF=Q(7_(~y_c4IeO8cNKqA6EjvL zt7Y4fNg90|aNrf>51fv8lp3V}7~7_)jWS|yij%%; z44M^sVmWyT<7A1Iyc?!M&enJ_I>B*-f1n0dVZ~vivkv)pA={`ZcJlkX26?tl$Bq%rET?OQNRj4 zNB5%f}fx+=i?SNH+jtH$s$KBt`{ zMlrtDncDoB7M-6O$XMYd7}l0QK(#1Ymho(;qrW|p@g^$`_|dwKO@Aw?_Y_R~Z^x1f zosxIV=km#~sx&_)yo;$lIc;|ME5jYDpuJZ~WpyzF1@9JR2se;vpVgo(MW;fn&`*G& zrGiYvSVLbWip1^XN|Ke%1S>e}aaD;Zjf100-%)F2te@LWY7i-8eqkvU)2><^$K#L1 zDQ{p5Sy@pz=@38;=~YM)S+QvVGnp1>kPJB7yrG5iq*Q z>~NU8X=8|TEMKP+i~(NwXj)tnBvQk-Yb*R3s-1+&A>G39?odo(0>~f;!xMnga+T4e zru5J@7AJA=bi0;k+KEp3iAtvj$u#1#5ceqXX~+}Gb|lMq;ZGx~QsOa-5RO@Sp#q83 z#|w<5lR$nkAmkQ9)8FR8=Qm8jsZQ0xO>N={tkmgRMSf+&UqahJsIXCUeWvEWh?e6; zP(rqg~3YR^aZROxJWmNPJ;-rlYx8>h7+9E7@@siE*()1b;Dk*_FF-wP>i)Jcg#`(eMMV6|`pZ;qhR+T&ll}%XoN*e$LevM&gfujCR644Q#L=8 
z3Ofo?hWYuRa;s%U7};rPOlwwVP2X5yaCrFB9mFT`PL_PB%30~%D0~*&R7AlF`{_Y* zL^=M3n(P(G5i?&Jr}U)`-HzVwek4CY5m!hises3J(n)IJh?dOLSe$_U^jS)@s7ml= zfl53-m-CDyX8#CR0QZ9U6U9{POPsu(9sTMD0u_%PdX9>=<<5_9nu^SM{uT5rd4Eto zOUQ=K0B)94vsYsxuwsOIll%IKG|Q`zBr6!^{5k-vQ28aF0kny`TIYOBMvKsIP`EG4 z{XHd~gY>wD{-7mCf*n5)HLN#m9)m`x3JrX6PL0A!aDYR_XXtU`4r= zCIpb9oDN_Y{%?QG05`-E9WR?{?ae;6Lou#9Fvycvc=CQK9TkO*V0Vx<4yI!Aim`r> zr1@tavJG&+%bKdB3rcX$3kT%PnX2?j)%Z@RyG>s z8Wu<(sQ#0vFkwO=OJ z$rl>5p5~+g0>Nvs7^$VmFeHYFPzw?7VeUfqAcl>JxMwg7k$Q5mG)~oUTX>Jo zuL%${JeMMhYR9yI+nk_%BdD{R^Fa_SI?Ftmi5Z%zCKwSN?Kr$@BaaGj;2 z3Xr{@aC&hpIv*L6(LDrjx(q-Ed2o)i+C4Q0s+a;7*ogEeiYc>>_sK?o#!7>H;kGIr z!w#zVBaO)_Or#~7L!GbAwKIk!ri-n=x_>FV7UY@{@`m+BXL$^5FLF?anrUU>BByjQdUrdom+r8*IYz=Yt|5E`HhSCNs|}BaJl4?j>}EKER9${M<5r9=#8=F zs;~oF4*J!l0;(B2X!vPdC)7O^8oSzP8ql#P9OKzZ40kmkAzotg;37)!YOnlN8L^ z$a)_OL<^fS7x6JS_&elpRn_#ciaZFBH_3}pJAUxK#}}%Vv)QTMIfU0^2Bo4iT@xt$ z?N5%Vu=j(MJu96*jYK=NT&{6`!a>#NA0z&8Z3$C57?(Xguk zz->VYIjvH}l~N~#NQoa0n?6;p$MfP74uXM`UhW|0v-~F-$b~SmsLF~VS-{>WqvL!= z#2iIF*@|HV@@zJ`gHj_RwZy(~Yf=lBIGISAn4FL|5@Awz4kV%NZ2_t-g+h!5B?2(y zJpSZq3fZRixG|r;702?H9cr1M|6(Yp$MSe2S`-2VTgz}H@=J=sB{_&?meuy6m0f8v zA&?LC6JAdvA?Q;(iv`J|{46D?)nFg;rzqTw%1vhC#a1#E)oQz)bxbm27?+a7 zaBg|?O9q!V?G5_`n=NOX4=&a`8!(y!7a`wuAd;KD`Yfrcj`dAVFdG z<9f6HL3qfdDzsYJbg_2vR}NYeCyvyE-#H{CKMBbY%aaPN5tSBThHDsv+)Znj+G9TC$Sjle?cW74Y)o z=QWhGX{hj*@H9qCk_hRdQ@N@puzT?Ed?p9R$Y_P5JiA|oIUO!PPuuy|Z50~Js_u-$ z9#GU((UBdQA`xu`jKH9c9f{qSOtFyWKw~vDe9{C`N-$wsp70V+zSRNd^`U5{?Lu*= zpAMjLop`Cc?FJ}%5VZ6i6Uox$L0He&#byTU!9_;|+sdIUVbfwh(x+@=$a@+z!AeT# zEtH-B2x3Y-7wBOtmJaz)i)rqaiR=MXoU6s}kjs&MsVHNW&@hA8I{H15MTD&O;&{ti zp04JdUn}COg|Ijf+OR`61PC9*cy~G;k<}fg6jAgn;Ex5PMHObX{!8em$HI4PB_*93 zHlf=0qK!uRB92*$5223WPubbt=f_AB)9)Dwql#Zn4C7#4-QO9JW^N>&Bo0^xxSTqyZ*f8MA(RJ?2ZwK1OALWndgoMAk(2Rwkdmi4&jvZX^$r zAoi)OlP&-sZN*H6`d5+30G&>3nRK3s%uU6&Z2~)-ewxLmpaL96!hKmv?uts6Fd`?s zwfB*wp^L=Ra}m31e`D+)#X{n+E#?UirRb{EwGYJpF$I;ENg7we8DT)#>J}`<+GDOR~Z}M25X=YL! zjQ2Ld{M2u(Q`$0I3A^%AExDz~1>c{Hqq|j>r|3D=`^4*HyTK9p#%kK%;BZ;0&B~4S z!?8n1k6|;fCDXgx9)vXQ5|$anYy2C&nDk{9=KNAnZf(uoGaFV``0_%kGt z2-vEiTZ#H+XWlmbtBmPT}a7wKCCUm0pP9)p$&dy?M2oY_ssQddUUjPXOLbTiX{~7YoE}Clc3_o=Wm& zlM&cr(H|0@&_hOYs3+Y)kPlVR3$a`TQ=!y5CX#6M3+a2QP&3H1Ki?A{G^xQMl0mj) z9S>MbncPJ#DtAOPg@CA$>wcCvKdREBzJc+*S{Y$WhUG~duMZ_FbffptY%@C}z%u-d zGT1Yh3)ad5;>hY473z8XO$$BZhrzk*3_MABn3)@Gp=)B<+0hroCBg`#$mtVW1qapC zC+?Cwx>Qq8&KBiFe$Nn-K%#6y-VpXYFt#-xl7I+0(yZLVFBF@d2ShT7&~hWHp={&R z4AIAOvYm6AadJ%|+SSd6h0Gw{l?ov#AJtA@42eOZ%gP;*E;M|K{l3oKNASg4HCcYr zoQM)`S#Z}KJ!4E784^qScOc(s$&w(8!b*4g4?jtjfsRdJoq^~|zUs~<%7dn!V_(T7 z4ZHZ!diTCHv&mQpb1! 
zfnmoeIx7Ql0pGX15sr-F8Yla>v_#*+9Ho`om~LqZ6i(*yp<3J27)Sn&JG_rAU_X9( zf|VOA*A#m4#b{1~5-`GoujU7$N><;VjtH?l%3?niHYJi>DzP^mEw*&%9(qQQCANrP zH0(piTb3!gIHLKjIT%+I(Uga#0C9ImEuzeiJs!>5QtVBCB!;r>pJPF*a&2w;vGwlx zVCP)VASZ!Lr7tI{z?9^>YRMutAp6hl=7y)R2CkEtgIe@QE+W5QRZ5@Z41b9!B5|Gv^L z72MduT|e@Tg(B2d ze_58)7rV@x?MPfVq0uB}^GU=G<9;E(I1*dluqbjif*+A5on8F%y&l4H3pz81HM%6> zcv22>e;mDn!wEo=`?HUEh}Wz%wNn;4CJ~t<#ch`Skwa_@{Z@PL^5?GF4Kseno{+yS?KRaI;Hk(C_FbEGx$ zS!bBZjuO#l*eJb=NiR;J;E__{6IvcrSt^Hd@JjPGao2+TaSjYV|8)%P=rvqor zSNf2@wE@9!VzD>KL!RFo5_i#TwYH4%qoY%vq1*Ko^(l^S;^g5sU1SyYph4mRFsq7d zgo+e?7{|A^Hyl8hg^Vds71oqAgb4@njU9xYnT|QM6Y<>88QJlqbA0>%0j`4374C`w zk<_J&AIv$5h=SUl^heiDrcTXi)k2+G0=%kQi3>O)ifI}{-Z9hfqH+03!&9yp7Qg41 z7_&8uAK}ONq1)OK7>g4XhFEFM;=iexKR;G1LU1oP$BL14(LidHCoNUi&?pRem_+=d z6xISZkPK4^`I#u*lXqfI^u#8xSj*qg*|WPewP8cZ@lJ%G)K*pnc3iy%ndLJ-Y&(wk zcvqMLd4O+#jO2OQk6QGO&l0!NUx{r`C>f-qvsAf$X28`=W?sY|KzM%@W&o1gD%-Hm zx*!ly=~grHEOSQ8nrJeNlW{p74WI3Y&8KM_Sqym&99;@B!49Yx?LOST!iN%${FtbR zh!<&69TxI68@U)SSA>A-db=CF;!idP@VkbvYjOvx`TYsxav0-GeoOR&a4wFvlF0cC zdeE%=z`YF7#EedSbfjZF^a_>aukP#uaR#58fwI5bpD*u5!Ys(IOR}DFq2&Ee$jVSt zQ&d(r9S<*x-|1rs=k~oww%GV>e#9@tDCY+twkGuCD~2%zTyM)CeZ_%`Kj25G`kZx7 z>9aKqOoe~ABaSt~Yq4abfv@b8MR9UCvzptZZ+XvOH5sMutz@Bt6r`WdaQx06RtX)s zra4Up5}&keHn|$dc250Ya0vh?z5{^5@;?C-zW?ic%#r_>Ba3CLCUd~|=S`SK{(mAI ztV?J5XcErNY~`DwcPsq*^>Z`)s1KP|=E5ImwhpW~=T+(^pZlIxrq^!B|znz%zh`e+kBO=zV{zFIL`oBGY>IH!>tAmg+S8rIk0|kG`Hz{#W}EXDIkE$a*0+EgJRb`sEzE|G#7}~%FaG~X z4eb~@&pKN_V4nDr^_6+Hg$bX}3tf^y=d1tAXINWuEGKf~v=%f&W^VMh^|St=GVFQZ zr&sJ-r!srRt(}>FGVahfTY3Rwk?~>t`#*!q@WJLEHJ~yiq03?{IFi8lL;4k zC!J~kSSwQ*bjg>74Lm7R8TtfVuehL7-gi+vnT$*>8`1Ggz*@-k}Vr)@Lfk?s|v9w0vo%Ch_9Z zZX_8b0e$N(a~L00hH)9#41<ibx@^!ZIgy?@nYl5j ztoDjM@XhDF5>8yCz3r!erXVuBt?SUi<)hA9^xMP%d1>Ca{Q7qBHeE>h^3RLPeo>0> zi~YKroEQ5KuwGKCbB=8Oht1G0EBw-+L4%zp^>}>D!NZ)%zUsHCJ(qg5cW-GNGOEV3 zeCP{q(-2%w+vQ<}%QDr`KsBuFFmC#a%fqD|2l$bthcBHN`S?TAs!>(vm9Iu@o;hIE z=t)h3v&ZcE)^ujfv|pNsCcXILr7c|4%;aM|!6IlrJ$8->fQA{TW)I6>AU3}=c;Wox ztH&VQW43QTR&~zdFXI>@fn&%TI=bf4;Gr98u1L?B45R$#_tc>Tn2`ty%q6%V3e! zl~~8CD?f{FSe^e_w06ADXXAG?Ul}Ws^xtmH_HB8|REcx+Eic(oV(qnkDPp~8iiCf2 zz8+kAO?v&SdUO72&)mSutDacP3{`&O{iSrvywI0>=Fg8SzMS;%(}E@G2Y(J->ev(d z&FH;7#Vr4fBkp5MyOaK}E$d-h^G)a6EqlZ4y;GjfdbMw6K!bBlePr|UK|@^4E8dj; zzw)jPDDILjGC0T3KmRDMHKO5 zlJs8Ilto&El@?IYa}V@#gtLXoxwg`+ zE6;7a|5=b_SoOjPC_jLd!uHj#Lyml~Pd;-;<&f?^G?^5Ec=)dkGoM}$QWaFs-- z@0{1Ss`G`n6<_6b&oh3t|7FGPom-vXm3L}i3x^+g;q{3FvtPV?<@{_zjsB%~)1L}m zn%&i__(-B?-!i^TdwG41rF$@K1L#G(@V>hEkag7Z+~uBQU+;hSX62-4gzL@?bLKPz zwy%D5|H?P!y!OeXKh5bG-)dNM&m&J}TMrgLnJ!>BW5e)s7a*X0z6<0dX73+eqxojh z_=Ptv4$Xe?%}a-nhQ#s3555J$3uuxMlIVO;`;+1w5R&+hCXW-7SuVyYZG|oRY^>@~ zF9A7c$kttb%AAk8`r%tnl><+?nL~ZaxB0pUwJUaX4{6fK8R=^$g(S3wrmr{6@7(mY z1k@z*-WxIR-GAwjWyXn@MlZj~tUYXh?_-G~_f*B@BaYp%1Mk`|9A5k0&3}9I-5Ukj zrf@<%BC{PNH6AFMv8np_KJ#1EeDU{1Mei?}uRi_0=axC&omh6q>f0S=@5Z?&z4!h3 z6(k{%n0$BAxAw@~Q!shx)IA4Y{pOUvZrZD-)ho!PQqYU}uzcsHb?a7kzBTWo)o&yO zCEkh?YyWf+2X%^#pfGb{_4l8w|NLKHlTH8Pt?$>o@$Hd^KYjdf|2}_A{Ar@;cVQ%$ z5}CThF&<4`vVNTVoY2;$KMsNtkzH3On&(?VYDHmg|LlqU=?CR!zdKoT=3j2R`Pgme zbC(`Go3?A+f^*yM==|Z_j)yP&@VO&A{q-+)&vgCxz$;tUJn`k;hdaWGee-v_&JXS5 z+Z4~g`1XxojqE%6dgBYH!ntNpn5a(f9#thIAgn3&Vo-vB2l!eqN@jOLMrsu#KFMHt zNf~^crK}WbR*@-BlfS-YkWKF6=wB)%?gb9x7hj&d6~IZsjFBV{ScH+{bd0P7kZO#? 
zjhJMb-;YhlP1qEYCC@z(;<2eD72>)c8PnkKu$HA$F`10zSxi=k?sbzmhmlk582KkqdB7#h@Ks16qm4@0K^3Re;>bkJ zM12a_1qToSDFE3C*$u3TNdR(~5f?vG!XnDRMnVN?fFuB9SYVJOjwJ(-@}M3t1E2#y z;}|lW0)R3&QQrMLKqkNr$ObF`Z~!xa1;FbB7_xdEU?u?4zp?269xw}#0muS?egp>3 zOtCqDTz~~I4IltOLjubOqyvfp@ah!i1N;%-0^9&713=CohEyfM==h2q3MjiSb72>WTGE}X#%8Z|v4xq&sQoBA?Z+>Etu^z7zg zle(c`qNLkvkw>2mva~dP%H9JQ$I|)6mTsGR`oc3gcuj%g_eO))WN9ccCaH6aOn(?D zXb7nrn#PtvCQU9;Gf8EV&)PbAwu+-w$=k(rYe{9Cp&lRIJUHm``c*CA zg&EMLni_N|C^%bt;!?y}xz~ylnXKDFw7U2p&C|q@(!o{P+%(go6@9@TcS-{Zx2VM} z3m3T>Ly&BhbpyL27dzFt6ifk@g!829oke_?kD_`FFLV2Cw&OF0Vn^W{>%gfdm zgl%0Gjijz1(0$t2m|vLd9q~pI^*|q7vK1}yP%1HFS9qS$C)4V@Qn<^$cdg%r;&b+8oC6yNOd>cc|24b0aV&#?7)t*wiBwuiEc=u{aM z_=mcOl8XJX(WesQxQeleqZ!V1S4?;su8hPpf;w=@Q9lco6e*E!NwKwjFdp2@n#je* z7Kcf#k?Sazp+)ekRZaDKJbRe7u(>MD?BH*0>=H~?u!3mRnA$|cX19qV!$M2X4oi58 zROkY?DiHQmmfB`6d;ZeCm~GSWIN!H}jV70j#~cb?PDTdK#=wqN`P6zU;at}&x~9kX z`@MGO^6u7ICoc`Gvu;(?EZSCV-Ky;S>brF}efory`B80_XQ|SMOBOWHGb83ImR7VX z4OIvAFdS-lGo72-J66^0RkbSn;Wa{4>*DbcyFznHL7!E|V47PkR_$OGc$0-}>UEm+ zs+#^0F>a$I#Rhz%CKD&Ysc}UCi%*3?7}R-H(vQwoCDAIb=AoK3l-n(l1syP|ddI>^ zK1wSJL1@=ZXj_#{e#hM^$sqohi~pq3iepvFZFX(E?^ZvrZ>ll1nClC)eI2o&#iCkb zaK=rvE-7SEf0%2W(s&yV9vjD@Fifj9{f&|Clt-iU}0As_;A0)7X`0H7d`0y-iQL=rXtA_+u3h;#(NuZaSY3lBhK zvjl+1;$A=@;0{0z01*x%tujD40FenIDnuSY9B*q7yUfARRfgtSh(iezHdVXrn;f1uu_;sDf;1F zPbC|atX;{bN(ynYhgGx=oV9TlMf0dX#QCU5Z(rNF%*|FRfvuwcRYw(5x{TLXwHv<5_QJ)1y2uPh&r ztL;8+o68wEU16Oi5m|*u*Sf|x<8ug{D)HMGq83ILG)an>NKG}2j9avrE>b*s5BO*wLmekLtMQ65~lC%H~>ygkp|@rOv*tDioFYb-Ta z%-en>^T3A{Osn!-RTgb+#CLD8?Xf;x^!425hRbv1yMs23hIf{CdqWz9dqQAO*c$P% z1Mb@^YJ0Es0!4U6D)ym3uTc!5jjFoDK1F@+gq?%U5r1;f#60yL{_=+vik(mNchMB9 z94HF``_a+%SQ5=j*j=oRq3MF&pp&7tU_`)7vGw_xu`G%b<3zTXvWtiL*a)SFIC*xM zvez7LhczNnoC#}exR{T@_B2`y3jE78_U=KJ|iX})f5^v`iN=>9hka1vGBxV{i zVI1U-HttOIP51m;8E}<4U2%59vB#fjmF%@|TbjXL)M`|9_KAu==IQ3?1`;M#lUbs3 z_*+(^n$nM7*#_}`vEXl~iJCaidMz})$=}5?nn+>0O*COmPTucj={m^(?+r;KgN7s* zBg}9a*!3QjHsWIa8gn>S9=d9wTgCQ}4*IM73~%~b?;wdvIYmpH9l@&e~|b5EZF)6JonoO-0823 zdovY9d>osrX%yRew~!ij_1YW)-97}v+Cp*A3sNEmAW%>mVhT9I0k2*vf2Z&fN-j#O zGuD?r#)MrR9-gAYE+>}6PU~lS=!^+f?!;(xo@Bg)Iapqr#CD&~vfPrT&rw0x0 zya*B+zv({6fvx@f{f8^?zpsAxMvPlHY4$m*;prMYFLB`eD3FR Date: Wed, 24 Jan 2018 11:31:04 -0300 Subject: [PATCH 41/55] added test cases for negex dataset --- .../ml/logreg/NegexDatasetLogRegTest.scala | 71 +++++++++++++++ .../ml/logreg/NegexDatasetPipelineTest.scala | 87 +++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala create mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala new file mode 100644 index 00000000000000..49468d51b8b34b --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala @@ -0,0 +1,71 @@ +package com.johnsnowlabs.ml.logreg + +import java.io.File + +import com.johnsnowlabs.ml.common.EvaluationMetrics +import com.johnsnowlabs.nlp.annotators.assertion.logreg.{SimpleTokenizer, Tokenizer, Windowing} +import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer} +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.{DataFrame, SparkSession} + +/** + * Test on simple dataset from NegEx + * Created by jose on 22/01/18. 
+ */ +object NegexDatasetLogRegTest extends App with Windowing with EvaluationMetrics { + + override val before: Int = 10 + override val after: Int = 10 + override val tokenizer: Tokenizer = new SimpleTokenizer + + /* local Spark for test */ + implicit val spark = SparkSession.builder().appName("DataFrame-UDF").master("local[4]").getOrCreate() + import spark.implicits._ + val datasetPath = "rsAnnotations-1-120-random.txt" + + val embeddingsDims = 200 + val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" + val fileDb = embeddingsFile + ".db" + + override lazy val wordVectors: Option[WordEmbeddings] = Option(embeddingsFile).map { + wordEmbeddingsFile => + require(new File(embeddingsFile).exists()) + val fileDb = wordEmbeddingsFile + ".db" + if (!new File(fileDb).exists()) + WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) + }.filter(_ => new File(fileDb).exists()) + .map(_ => WordEmbeddings(fileDb, embeddingsDims)) + + val mappings = Map("Affirmed" -> 0.0, "Negated" -> 1.0) + val reader = new NegexDatasetReader() + + val ds = reader.readDataframe(datasetPath) + .withColumn("features", applyWindowUdf($"sentence", $"target", $"start", $"end")) + .withColumn("label", labelToNumber($"label")) + .select($"features", $"label").cache() + + // Split the data into training and test sets (30% held out for testing). + val Array(trainingData, testData) = ds.randomSplit(Array(0.7, 0.3)) + + val model = train(trainingData) + val result = model.transform(testData) + + val pred = result.map(r => r.getAs[Double]("prediction")).collect + val gold = result.map(r => r.getAs[Double]("label")).collect + + println(calcStat(pred, gold)) + println(confusionMatrix(pred, gold)) + + def train(dataFrame: DataFrame) = { + import spark.implicits._ + val lr = new LogisticRegression() + .setMaxIter(8) + .setRegParam(0.01) + .setElasticNetParam(0.8) + lr.fit(dataFrame) + } + + def labelToNumber = udf { label:String => mappings.get(label)} + +} \ No newline at end of file diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala new file mode 100644 index 00000000000000..1b1bd36ea1ed35 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala @@ -0,0 +1,87 @@ +package com.johnsnowlabs.ml.logreg + +import com.johnsnowlabs.ml.common.EvaluationMetrics +import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach +import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat +import com.johnsnowlabs.nlp.{Annotation, DocumentAssembler} +import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} + +object NegexDatasetPipelineTest extends App with EvaluationMetrics { + + implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[1]").getOrCreate + import spark.implicits._ + + // directory of the i2b2 dataset + val i2b2Dir = "/home/jose/Downloads/i2b2" + + // word embeddings location + val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin" + val datasetPath = "rsAnnotations-1-120-random.txt" + val embeddingsDims = 200 + + val reader = new NegexDatasetReader() + + val dataset = "rsAnnotations-1-120-random.txt" + + val ds = reader.readDataframe(datasetPath).cache() + + // Split the data into training and test sets (30% held out for testing). 
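+  // Note: randomSplit weights are normalized to sum to 1; without an explicit seed
+  // (e.g. ds.randomSplit(Array(0.7, 0.3), seed = 42)) the 70/30 partition differs
+  // between runs, so the printed metrics are not exactly reproducible.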
+ val Array(trainingData, testData) = ds.randomSplit(Array(0.7, 0.3)) + + val model = trainAssertionModel(trainingData) + var result = testAssertionModel(testData, model) + + var pred = result.select($"assertion").collect.map(row => Annotation(row.getAs[Seq[Row]]("assertion").head).result) + var gold = result.select($"label").collect.map(_.getAs[String]("label")) + + println(calcStat(pred, gold)) + println(confusionMatrix(pred, gold)) + + /* test serialization */ + val modelName = "assertion_model" + model.write.overwrite().save(modelName) + val readModel = PipelineModel.read.load(modelName) + + result = testAssertionModel(testData, readModel) + pred = result.select($"assertion").collect.map(row => Annotation(row.getAs[Seq[Row]]("assertion").head).result) + gold = result.select($"label").collect.map(_.getAs[String]("label")) + + println(calcStat(pred, gold)) + println(confusionMatrix(pred, gold)) + + def getAssertionStages(): Array[_ <: PipelineStage] = { + + val documentAssembler = new DocumentAssembler() + .setInputCol("sentence") + .setOutputCol("document") + + val assertionStatus = new AssertionLogRegApproach() + .setLabelCol("label") + .setInputCols("document") + .setOutputCol("assertion") + .setBefore(11) + .setAfter(13) + .setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary) + + Array(documentAssembler, + assertionStatus) + } + + def trainAssertionModel(dataset: DataFrame): PipelineModel = { + + System.out.println("Start fitting") + + // train Assertion Status + val pipeline = new Pipeline() + .setStages(getAssertionStages) + + pipeline.fit(dataset) + } + + def testAssertionModel(dataset: DataFrame, model: PipelineModel) = { + System.out.println("Test Dataset Reading") + model.transform(dataset) + } + +} From fc617de82ce385ab9bef0544c6b8567c702bbc27 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 24 Jan 2018 15:51:55 -0300 Subject: [PATCH 42/55] fixed problem with parameters --- .../logreg/AssertionLogRegModel.scala | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index ff28830c552fdc..886461d9c7156d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -7,7 +7,7 @@ import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.hadoop.fs.Path -import org.apache.spark.ml.param.Param +import org.apache.spark.ml.param.{IntParam, Param} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.functions.udf @@ -23,22 +23,23 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS extends DatasetAnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings[AssertionLogRegModel] with Windowing { - /* remove these Params */ - val beforeParam = new Param[Int](this, "beforeParam", "Length of the context before the target") - val afterParam = new Param[Int](this, "afterParam", "Length of the context after the target") + override val tokenizer: Tokenizer = new SimpleTokenizer + override val annotatorType: AnnotatorType = ASSERTION + override val requiredAnnotatorTypes = 
Array(DOCUMENT) + override lazy val wordVectors: Option[WordEmbeddings] = embeddings + + val beforeParam = new IntParam(this, "beforeParam", "Length of the context before the target") + val afterParam = new IntParam(this, "afterParam", "Length of the context after the target") override lazy val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) setDefault( - beforeParam -> 11, - afterParam -> 13 - ) + beforeParam -> 11, + afterParam -> 13 + ) def setBefore(before: Int) = set(beforeParam, before) def setAfter(after: Int) = set(afterParam, after) - override val tokenizer: Tokenizer = new SimpleTokenizer - override val annotatorType: AnnotatorType = ASSERTION - override val requiredAnnotatorTypes = Array(DOCUMENT) override final def transform(dataset: Dataset[_]): DataFrame = { require(validate(dataset.schema), s"Missing annotators in pipeline. Make sure the following are present: " + @@ -70,8 +71,6 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS override protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] = annotations - override lazy val wordVectors: Option[WordEmbeddings] = embeddings - var model: Option[LogisticRegressionModel] = None var labelMap: Option[Map[Double, String]] = None From 748b1e601b8be95ad460d568532326b531e2951e Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 24 Jan 2018 16:22:49 -0300 Subject: [PATCH 43/55] removed some hard-coded params --- .../logreg/AssertionLogRegApproach.scala | 18 ++++++++++++--- .../logreg/AssertionLogRegModel.scala | 23 +++++++++++++++---- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 0dffafce0bb2db..468de659383589 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -40,6 +40,10 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac val beforeParam = new IntParam(this, "beforeParam", "Length of the context before the target") val afterParam = new IntParam(this, "afterParam", "Length of the context after the target") + val startParam = new Param[String](this, "afterParam", "Column that contains the token number for the start of the target") + val endParam = new Param[String](this, "afterParam", "Column that contains the token number for the end of the target") + + def setLabelCol(label: String) = set(label, label) def setDocumentCol(document: String) = set(document, document) def setTargetCol(target: String) = set(target, target) @@ -48,6 +52,8 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac def setEnet(enet: Double) = set(eNetParam, enet) def setBefore(b: Int) = set(beforeParam, b) def setAfter(a: Int) = set(afterParam, a) + def setStart(start: String) = set(startParam, start) + def setEnd(end: String) = set(endParam, end) setDefault(label -> "label", document -> "document", @@ -56,7 +62,9 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac regParam -> 0.00192, eNetParam -> 0.9, beforeParam -> 10, - afterParam -> 10 + afterParam -> 10, + startParam -> "start", + endParam -> "end" ) /* send this to common place */ @@ -69,8 +77,12 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac 
/* apply UDF to fix the length of each document */ val processed = dataset.toDF. - withColumn("text", extractTextUdf($"document")). - withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) + withColumn("text", extractTextUdf(col(getInputCols.head))). + withColumn("features", applyWindowUdf($"text", + col(getOrDefault(target)), + col(getOrDefault(startParam)), + col(getOrDefault(endParam)))) + val lr = new LogisticRegression() .setMaxIter(getOrDefault(maxIter)) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 886461d9c7156d..b127c30709ad17 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -5,11 +5,11 @@ import com.johnsnowlabs.nlp.{Annotation, DatasetAnnotatorModel} import com.johnsnowlabs.nlp.embeddings.{ModelWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.hadoop.fs.Path import org.apache.spark.ml.param.{IntParam, Param} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema -import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.functions._ import scala.collection.immutable.Map import scala.collection.mutable @@ -30,6 +30,15 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS val beforeParam = new IntParam(this, "beforeParam", "Length of the context before the target") val afterParam = new IntParam(this, "afterParam", "Length of the context after the target") + + // the document where we're extracting the assertion + val document = new Param[String](this, "document", "Column with the text to be analyzed") + // the target term, that must appear capitalized in the document, e.g., 'diabetes' + val target = new Param[String](this, "target", "Column with the target to analyze") + val startParam = new Param[String](this, "afterParam", "Column that contains the token number for the start of the target") + val endParam = new Param[String](this, "afterParam", "Column that contains the token number for the end of the target") + + override lazy val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) setDefault( @@ -39,6 +48,9 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS def setBefore(before: Int) = set(beforeParam, before) def setAfter(after: Int) = set(afterParam, after) + def setStart(start: String) = set(startParam, start) + def setEnd(end: String) = set(endParam, end) + def setTargetCol(target: String) = set(target, target) override final def transform(dataset: Dataset[_]): DataFrame = { @@ -50,8 +62,11 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS /* apply UDF to fix the length of each document */ val processed = dataset.toDF. - withColumn("text", extractTextUdf($"document")). - withColumn("features", applyWindowUdf($"text", $"target", $"start", $"end")) + withColumn("text", extractTextUdf(col(getOrDefault(document)))). 
+ withColumn("features", applyWindowUdf($"text", + col(getOrDefault(target)), + col(getOrDefault(startParam)), + col(getOrDefault(endParam)))) model.get.transform(processed).withColumn(getOutputCol, packAnnotations($"text", $"target", $"start", $"end", $"prediction")) } From 9512e451eba3064a7703c5b2533bc2b2217138a3 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 24 Jan 2018 17:09:42 -0300 Subject: [PATCH 44/55] minor changes in parameters --- .../example/logreg-assertion/assertion.ipynb | 50 +++++++++++-------- python/sparknlp/annotator.py | 3 +- .../logreg/AssertionLogRegApproach.scala | 17 ++++--- .../logreg/AssertionLogRegModel.scala | 5 +- .../ml/logreg/NegexDatasetPipelineTest.scala | 6 +-- 5 files changed, 45 insertions(+), 36 deletions(-) diff --git a/python/example/logreg-assertion/assertion.ipynb b/python/example/logreg-assertion/assertion.ipynb index b57d3b4168bc37..3cf8d61f8d8e18 100644 --- a/python/example/logreg-assertion/assertion.ipynb +++ b/python/example/logreg-assertion/assertion.ipynb @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": { "scrolled": false }, @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -193,44 +193,50 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, + "outputs": [], + "source": [ + "pipeline.write().overwrite().save(\"./assertion_pipeline\")\n", + "model.write().overwrite().save(\"./assertion_model\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": false + }, "outputs": [ { "ename": "Py4JJavaError", - "evalue": "An error occurred while calling o286.save.\n: scala.NotImplementedError: The default jsonEncode only supports string and vector. 
org.apache.spark.ml.param.Param must override jsonEncode for java.lang.Integer.\n\tat org.apache.spark.ml.param.Param.jsonEncode(params.scala:98)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1$$anonfun$2.apply(ReadWrite.scala:296)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1$$anonfun$2.apply(ReadWrite.scala:295)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.AbstractTraversable.map(Traversable.scala:104)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1.apply(ReadWrite.scala:295)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1.apply(ReadWrite.scala:295)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$.getMetadataToSave(ReadWrite.scala:295)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:277)\n\tat org.apache.spark.ml.util.DefaultParamsWriter.saveImpl(ReadWrite.scala:250)\n\tat org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:114)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$saveImpl$1.apply(Pipeline.scala:254)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$saveImpl$1.apply(Pipeline.scala:253)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.saveImpl(Pipeline.scala:253)\n\tat org.apache.spark.ml.Pipeline$PipelineWriter.saveImpl(Pipeline.scala:205)\n\tat org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:114)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n", + "evalue": "An error occurred while calling o208.load.\n: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()\n\tat java.lang.Class.getMethod(Class.java:1786)\n\tat org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat 
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)\n\tat org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moverwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./ner_pipeline\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moverwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./ner_model\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pyspark/ml/util.pyc\u001b[0m in \u001b[0;36msave\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbasestring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"path should be a basestring, got type %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 107\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jwrite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 108\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0moverwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mml\u001b[0m 
\u001b[0;32mimport\u001b[0m \u001b[0mPipelineModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPipeline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mPipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./assertion_pipeline\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0msameModel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPipelineModel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./assertion_model\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pyspark/ml/util.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbasestring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 196\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"path should be a basestring, got type %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 197\u001b[0;31m \u001b[0mjava_obj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 198\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_clazz\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"_from_java\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 199\u001b[0m raise NotImplementedError(\"This Java ML type cannot be loaded into Python currently: %r\"\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/py4j/java_gateway.pyc\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1131\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1132\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1133\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1134\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pyspark/sql/utils.pyc\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/py4j/protocol.pyc\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 317\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 320\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m raise Py4JError(\n", - "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o286.save.\n: scala.NotImplementedError: The default jsonEncode only supports string and vector. 
org.apache.spark.ml.param.Param must override jsonEncode for java.lang.Integer.\n\tat org.apache.spark.ml.param.Param.jsonEncode(params.scala:98)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1$$anonfun$2.apply(ReadWrite.scala:296)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1$$anonfun$2.apply(ReadWrite.scala:295)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.AbstractTraversable.map(Traversable.scala:104)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1.apply(ReadWrite.scala:295)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$$anonfun$1.apply(ReadWrite.scala:295)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$.getMetadataToSave(ReadWrite.scala:295)\n\tat org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:277)\n\tat org.apache.spark.ml.util.DefaultParamsWriter.saveImpl(ReadWrite.scala:250)\n\tat org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:114)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$saveImpl$1.apply(Pipeline.scala:254)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$saveImpl$1.apply(Pipeline.scala:253)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.saveImpl(Pipeline.scala:253)\n\tat org.apache.spark.ml.Pipeline$PipelineWriter.saveImpl(Pipeline.scala:205)\n\tat org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:114)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n" + "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o208.load.\n: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()\n\tat java.lang.Class.getMethod(Class.java:1786)\n\tat org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat 
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)\n\tat org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n" ] } ], - "source": [ - "pipeline.write().overwrite().save(\"./assertion_pipeline\")\n", - "model.write().overwrite().save(\"./assertion_model\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], "source": [ "from pyspark.ml import PipelineModel, Pipeline\n", "\n", "Pipeline.read().load(\"./assertion_pipeline\")\n", "sameModel = PipelineModel.read().load(\"./assertion_model\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py index 55a50fa10bc4a8..d219de7961384d 100755 --- a/python/sparknlp/annotator.py +++ b/python/sparknlp/annotator.py @@ -550,7 +550,8 @@ class AssertionLogRegApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, Ann eNetParam = Param(Params._dummy(), "eNetParam", "Elastic net parameter", TypeConverters.toFloat) beforeParam = Param(Params._dummy(), "beforeParam", "Length of the context before the target", TypeConverters.toInt) afterParam = Param(Params._dummy(), "afterParam", "Length of the context after the target", TypeConverters.toInt) - + startParam = Param(Params._dummy(), "startParam", "Column that contains the token number for the start of the target", typeConverter=TypeConverters.toString) + endParam = Param(Params._dummy(), "endParam", "Column that contains the token number for the end of the target", typeConverter=TypeConverters.toString) def setLabelCol(self, label): self._set(label = label) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 468de659383589..c665532533ccd9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -5,7 +5,7 @@ import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.util.Identifiable -import org.apache.spark.ml.param.{IntParam, Param} +import org.apache.spark.ml.param.{DoubleParam, IntParam, Param} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.Dataset import 
org.apache.spark.sql.functions._ @@ -34,14 +34,14 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac val document = new Param[String](this, "document", "Column with the text to be analyzed") // the target term, that must appear capitalized in the document, e.g., 'diabetes' val target = new Param[String](this, "target", "Column with the target to analyze") - val maxIter = new Param[Int](this, "maxIter", "Max number of iterations for algorithm") - val regParam = new Param[Double](this, "regParam", "Regularization parameter") - val eNetParam = new Param[Double](this, "eNetParam", "Elastic net parameter") + val maxIter = new IntParam(this, "maxIter", "Max number of iterations for algorithm") + val regParam = new DoubleParam(this, "regParam", "Regularization parameter") + val eNetParam = new DoubleParam(this, "eNetParam", "Elastic net parameter") val beforeParam = new IntParam(this, "beforeParam", "Length of the context before the target") val afterParam = new IntParam(this, "afterParam", "Length of the context after the target") - val startParam = new Param[String](this, "afterParam", "Column that contains the token number for the start of the target") - val endParam = new Param[String](this, "afterParam", "Column that contains the token number for the end of the target") + val startParam = new Param[String](this, "startParam", "Column that contains the token number for the start of the target") + val endParam = new Param[String](this, "endParam", "Column that contains the token number for the end of the target") def setLabelCol(label: String) = set(label, label) @@ -83,7 +83,6 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac col(getOrDefault(startParam)), col(getOrDefault(endParam)))) - val lr = new LogisticRegression() .setMaxIter(getOrDefault(maxIter)) .setRegParam(getOrDefault(regParam)) @@ -102,6 +101,10 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac AssertionLogRegModel() .setBefore(getOrDefault(beforeParam)) .setAfter(getOrDefault(afterParam)) + .setInputCols(getOrDefault(inputCols)) + .setTargetCol(getOrDefault(target)) + .setStart(getOrDefault(startParam)) + .setEnd(getOrDefault(endParam)) .setLabelMap(labelMappings) .setModel(lr.fit(processedWithLabel)) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index b127c30709ad17..ec0e886bf56560 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -35,9 +35,8 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS val document = new Param[String](this, "document", "Column with the text to be analyzed") // the target term, that must appear capitalized in the document, e.g., 'diabetes' val target = new Param[String](this, "target", "Column with the target to analyze") - val startParam = new Param[String](this, "afterParam", "Column that contains the token number for the start of the target") - val endParam = new Param[String](this, "afterParam", "Column that contains the token number for the end of the target") - + val startParam = new Param[String](this, "startParam", "Column that contains the token number for the start of the target") + val endParam = new Param[String](this, "endParam", "Column that contains the 
token number for the end of the target") override lazy val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala index 1b1bd36ea1ed35..49e78c759da9fa 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala @@ -24,13 +24,13 @@ object NegexDatasetPipelineTest extends App with EvaluationMetrics { val dataset = "rsAnnotations-1-120-random.txt" - val ds = reader.readDataframe(datasetPath).cache() + val ds = reader.readDataframe(datasetPath).cache // Split the data into training and test sets (30% held out for testing). val Array(trainingData, testData) = ds.randomSplit(Array(0.7, 0.3)) - val model = trainAssertionModel(trainingData) - var result = testAssertionModel(testData, model) + val model = trainAssertionModel(trainingData.cache) + var result = testAssertionModel(testData.cache, model) var pred = result.select($"assertion").collect.map(row => Annotation(row.getAs[Seq[Row]]("assertion").head).result) var gold = result.select($"label").collect.map(_.getAs[String]("label")) From c207608ee71901ad56191b839729feb75fe2baac Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 25 Jan 2018 11:19:04 -0300 Subject: [PATCH 45/55] refactor to avoid embeddings serialization --- .../com/johnsnowlabs/nlp/AnnotatorModel.scala | 15 ++------- .../johnsnowlabs/nlp/HasWordEmbeddings.scala | 24 +++++++++----- .../logreg/AssertionLogRegApproach.scala | 19 ++++++----- .../logreg/AssertionLogRegModel.scala | 32 ++++++++++--------- .../assertion/logreg/Windowing.scala | 4 +-- .../ApproachWithWordEmbeddings.scala | 11 +++++-- 6 files changed, 57 insertions(+), 48 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala index 609894aa7b8fd1..21c9841d7e77de 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala @@ -16,7 +16,8 @@ abstract class AnnotatorModel[M <: Model[M]] with ParamsAndFeaturesWritable with HasAnnotatorType with HasInputAnnotationCols - with HasOutputAnnotationCol { + with HasOutputAnnotationCol + with TransformModelSchema { /** * internal types to show Rows as a relevant StructType @@ -40,18 +41,6 @@ abstract class AnnotatorModel[M <: Model[M]] annotate(annotatorProperties.flatMap(_.map(Annotation(_)))) } - /** Shape of annotations at output */ - private def outputDataType: DataType = ArrayType(Annotation.dataType) - - /** requirement for pipeline transformation validation. 
It is called on fit() */ - override final def transformSchema(schema: StructType): StructType = { - val metadataBuilder: MetadataBuilder = new MetadataBuilder() - metadataBuilder.putString("annotatorType", annotatorType) - val outputFields = schema.fields :+ - StructField(getOutputCol, outputDataType, nullable = false, metadataBuilder.build) - StructType(outputFields) - } - /** * Given requirements are met, this applies ML transformation within a Pipeline or stand-alone diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala index 114bfc7aa40bf3..6bd46f3f3b8a5e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala @@ -25,14 +25,22 @@ trait HasWordEmbeddings extends AutoCloseable with ParamsAndFeaturesWritable { def setDims(nDims: Int): this.type = set(this.nDims, nDims) def setIndexPath(path: String): this.type = set(this.indexPath, path) - lazy val embeddings: Option[WordEmbeddings] = get(indexPath).map { path => - // Have to copy file because RockDB changes it and Spark rises Exception - val src = SparkFiles.get(path) - val workPath = src + "_work" - if (!new File(workPath).exists()) - FileUtil.deepCopy(new File(src), new File(workPath), null, false) - - WordEmbeddings(workPath, $(nDims)) + @transient + var wembeddings: Option[WordEmbeddings] = None + + def embeddings(): Option[WordEmbeddings] = { + if (wembeddings == null || wembeddings.isEmpty) { + wembeddings = get(indexPath).map { path => + // Have to copy file because RockDB changes it and Spark rises Exception + val src = SparkFiles.get(path) + val workPath = src + "_work" + if (!new File(workPath).exists()) + FileUtil.deepCopy(new File(src), new File(workPath), null, false) + + WordEmbeddings(workPath, $(nDims)) + } + } + wembeddings } override def close(): Unit = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index c665532533ccd9..4b7d2bc3897218 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -1,11 +1,11 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg -import com.johnsnowlabs.nlp.AnnotatorApproach +import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.AnnotatorType._ -import com.johnsnowlabs.nlp.embeddings.{AnnotatorWithWordEmbeddings, WordEmbeddings} +import com.johnsnowlabs.nlp.embeddings.{ApproachWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.ml.param.{DoubleParam, IntParam, Param} +import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} +import org.apache.spark.ml.param.{DoubleParam, IntParam, Param, ParamMap} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ @@ -15,13 +15,14 @@ import scala.collection.mutable /** * Created by jose on 22/11/17. 
*/ -class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproach[AssertionLogRegModel] with - AnnotatorWithWordEmbeddings[AssertionLogRegApproach, AssertionLogRegModel] with Windowing { +class AssertionLogRegApproach(val uid: String) + extends ApproachWithWordEmbeddings[AssertionLogRegApproach, AssertionLogRegModel] with Windowing { + //AnnotatorApproach[AssertionLogRegModel] override val requiredAnnotatorTypes = Array(DOCUMENT) - override val description: String = "Clinical Text Status Assertion" + val description: String = "Clinical Text Status Assertion" override val tokenizer: Tokenizer = new SimpleTokenizer - override lazy val wordVectors: Option[WordEmbeddings] = embeddings + override def wordVectors: Option[WordEmbeddings] = embeddings lazy override val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) @@ -110,4 +111,6 @@ class AssertionLogRegApproach(override val uid: String) extends AnnotatorApproac } private def labelToNumber(mappings: Map[String, Double]) = udf { label:String => mappings.get(label)} + + } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index ec0e886bf56560..7e0f27023f0012 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -1,27 +1,34 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT} -import com.johnsnowlabs.nlp.{Annotation, DatasetAnnotatorModel} -import com.johnsnowlabs.nlp.embeddings.{ModelWithWordEmbeddings, WordEmbeddings} +import com.johnsnowlabs.nlp.{HasOutputAnnotationCol, _} +import com.johnsnowlabs.nlp.embeddings.WordEmbeddings import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.hadoop.fs.Path -import org.apache.spark.ml.param.{IntParam, Param} +import org.apache.spark.ml.Model +import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ import scala.collection.immutable.Map import scala.collection.mutable - /** * Created by jose on 22/11/17. */ -class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) - extends DatasetAnnotatorModel[AssertionLogRegModel] with ModelWithWordEmbeddings[AssertionLogRegModel] - with Windowing { +class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) extends Model[AssertionLogRegModel] + with ParamsAndFeaturesWritable + with HasAnnotatorType + with HasInputAnnotationCols + with HasOutputAnnotationCol + with HasWordEmbeddings + with Windowing + with Serializable + with TransformModelSchema { override val tokenizer: Tokenizer = new SimpleTokenizer override val annotatorType: AnnotatorType = ASSERTION @@ -51,7 +58,6 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS def setEnd(end: String) = set(endParam, end) def setTargetCol(target: String) = set(target, target) - override final def transform(dataset: Dataset[_]): DataFrame = { require(validate(dataset.schema), s"Missing annotators in pipeline. 
Make sure the following are present: " + s"${requiredAnnotatorTypes.mkString(", ")}") @@ -83,8 +89,6 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS Seq(annotation) } - override protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] = annotations - var model: Option[LogisticRegressionModel] = None var labelMap: Option[Map[Double, String]] = None @@ -104,13 +108,15 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS def extractTextUdf = udf { document:mutable.WrappedArray[GenericRowWithSchema] => document.head.getString(3) } + + /** requirement for annotators copies */ + override def copy(extra: ParamMap): AssertionLogRegModel = defaultCopy(extra) } object AssertionLogRegModel extends DefaultParamsReadable[AssertionLogRegModel] { def apply(): AssertionLogRegModel = new AssertionLogRegModel() override def read: MLReader[AssertionLogRegModel] = new AssertionModelReader(super.read) - class AssertionModelReader(baseReader: MLReader[AssertionLogRegModel]) extends MLReader[AssertionLogRegModel] { override def load(path: String): AssertionLogRegModel = { val instance = baseReader.load(path) @@ -155,8 +161,4 @@ object AssertionLogRegModel extends DefaultParamsReadable[AssertionLogRegModel] model.serializeEmbeddings(path, sparkSession.sparkContext) } } - } - - - diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 14bc00b285d208..7f2f1ee22d24c2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -18,7 +18,7 @@ trait Windowing extends Serializable { val tokenizer : Tokenizer - lazy val wordVectors: Option[WordEmbeddings] = None + def wordVectors(): Option[WordEmbeddings] = None /* apply window, pad/truncate sentence according to window */ def applyWindow(doc: String, s: Int, e: Int): (Array[String], Array[String], Array[String]) = { @@ -86,7 +86,7 @@ trait Windowing extends Serializable { def applyWindowUdf = //here 's' and 'e' are token numbers for start and end of target when split on " " udf { (doc:String, targetTerm:String, s:Int, e:Int) => - Vectors.dense(applyWindow(wordVectors.get)(doc, targetTerm, s, e)) + Vectors.dense(applyWindow(wordVectors.get)(doc, targetTerm, s, e)) } def l2norm(xs: Array[Double]):Double = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala index 80abbf63f69b45..7a98db2c5597f5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala @@ -46,7 +46,7 @@ abstract class ApproachWithWordEmbeddings[A <: ApproachWithWordEmbeddings[A, M], // 3. Copy WordEmbeddings to cluster WordEmbeddingsClusterHelper.copyIndexToCluster(localPath.get, spark.sparkContext) // 4. 
Create Embeddings for usage during train - embeddings = Some(WordEmbeddings(localPath.get, $(embeddingsNDims))) + wembeddings = Some(WordEmbeddings(localPath.get, $(embeddingsNDims))) } } @@ -59,7 +59,14 @@ abstract class ApproachWithWordEmbeddings[A <: ApproachWithWordEmbeddings[A, M], } } - var embeddings: Option[WordEmbeddings] = None + @transient + var wembeddings: Option[WordEmbeddings] = None + + def embeddings(): Option[WordEmbeddings] = { + if (wembeddings == null || wembeddings.isEmpty) + wembeddings = Some(WordEmbeddings(localPath.get, $(embeddingsNDims))) + wembeddings + } private var localPath: Option[String] = None private def indexEmbeddings(localFile: String, spark: SparkContext): Unit = { From 7ce496c351e0f6429bff64ce42be10f40c505020 Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 25 Jan 2018 14:55:45 -0300 Subject: [PATCH 46/55] missing file --- .../example/logreg-assertion/assertion.ipynb | 4 +-- python/sparknlp/annotator.py | 12 ++++++++- .../johnsnowlabs/nlp/TransformSchema.scala | 25 +++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 src/main/scala/com/johnsnowlabs/nlp/TransformSchema.scala diff --git a/python/example/logreg-assertion/assertion.ipynb b/python/example/logreg-assertion/assertion.ipynb index 3cf8d61f8d8e18..982df84b546f68 100644 --- a/python/example/logreg-assertion/assertion.ipynb +++ b/python/example/logreg-assertion/assertion.ipynb @@ -210,7 +210,7 @@ "outputs": [ { "ename": "Py4JJavaError", - "evalue": "An error occurred while calling o208.load.\n: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()\n\tat java.lang.Class.getMethod(Class.java:1786)\n\tat org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)\n\tat org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n", + "evalue": "An error occurred while calling o220.load.\n: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()\n\tat 
java.lang.Class.getMethod(Class.java:1786)\n\tat org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)\n\tat org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", @@ -220,7 +220,7 @@ "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/py4j/java_gateway.pyc\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1131\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1132\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1133\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1134\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pyspark/sql/utils.pyc\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m 
\u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/py4j/protocol.pyc\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 317\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 320\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m raise Py4JError(\n", - "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o208.load.\n: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()\n\tat java.lang.Class.getMethod(Class.java:1786)\n\tat org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)\n\tat org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n" + "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o220.load.\n: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()\n\tat java.lang.Class.getMethod(Class.java:1786)\n\tat org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)\n\tat 
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)\n\tat org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n" ] } ], diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py index c42a667bdbe0b7..ec41ae65c729e3 100755 --- a/python/sparknlp/annotator.py +++ b/python/sparknlp/annotator.py @@ -661,6 +661,14 @@ def setAfter(self, after): self._set(afterParam = after) return self + def setStart(self, s): + self._set(startParam = s) + return self + + def setEnd(self, e): + self._set(endParam = e) + return self + def _create_model(self, java_model): return AssertionLogRegModel(java_model) @@ -675,7 +683,9 @@ def __init__(self): regParam = 0.00192, eNetParam = 0.9, beforeParam = 10, - afterParam = 10) + afterParam = 10, + startParam = "start", + endParam = "end") class AssertionLogRegModel(JavaModel, JavaMLWritable, JavaMLReadable, AnnotatorProperties): diff --git a/src/main/scala/com/johnsnowlabs/nlp/TransformSchema.scala b/src/main/scala/com/johnsnowlabs/nlp/TransformSchema.scala new file mode 100644 index 00000000000000..bc69795933ab95 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/TransformSchema.scala @@ -0,0 +1,25 @@ +package com.johnsnowlabs.nlp + +import org.apache.spark.ml.Model +import org.apache.spark.sql.types._ + +/** + * Created by jose on 25/01/18. + */ +trait TransformModelSchema { + + this: Model[_] with HasOutputAnnotationCol with HasAnnotatorType => + + /** Shape of annotations at output */ + private def outputDataType: DataType = ArrayType(Annotation.dataType) + + /** requirement for pipeline transformation validation. 
It is called on fit() */ + override final def transformSchema(schema: StructType): StructType = { + val metadataBuilder: MetadataBuilder = new MetadataBuilder() + metadataBuilder.putString("annotatorType", annotatorType) + val outputFields = schema.fields :+ + StructField(getOutputCol, outputDataType, nullable = false, metadataBuilder.build) + StructType(outputFields) + } + +} From 35eb023c15307c8df7796914876956e930e587a2 Mon Sep 17 00:00:00 2001 From: Alberto Date: Thu, 25 Jan 2018 15:58:23 -0300 Subject: [PATCH 47/55] fixes for serialization --- docs/components.html | 2 +- .../example/logreg-assertion/assertion.ipynb | 27 ++----------------- python/sparknlp/__init__.py | 1 + python/sparknlp/annotator.py | 1 + .../logreg/AssertionLogRegApproach.scala | 5 ++-- 5 files changed, 8 insertions(+), 28 deletions(-) diff --git a/docs/components.html b/docs/components.html index be9da8fa1e230e..0e1b70c89c1def 100644 --- a/docs/components.html +++ b/docs/components.html @@ -1104,7 +1104,7 @@

 15. AssertionStatus: Assertion Status
         .setOutputCol("assertion") \
         .setBefore(11) \
         .setAfter(13) \
-        .setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary)
+        .setEmbeddingsSource(embeddingsFile, 200, 3)
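
The docs hunk above replaces the WordEmbeddingsFormat.Binary enum with its integer id (3), presumably because this example targets the Python API, where the format is passed as a plain int. For orientation on the Scala side, a minimal sketch of the same stage follows, assembled from the setters visible in this patch series (setLabelCol, setStart/setEnd for the target's token span, setBefore/setAfter for the context window). The documentAssembler stage, the embeddings path, the column names, and the exact setter names on the Approach are assumptions for illustration, not guaranteed by this diff:

    import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach
    import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat
    import org.apache.spark.ml.Pipeline

    // Hypothetical wiring; the path and the column names ("document", "label",
    // "start", "end") are placeholders for whatever the training DataFrame uses.
    val embeddingsFile = "/path/to/embeddings.bin"

    val assertionStatus = new AssertionLogRegApproach("assertion_logreg")
      .setInputCols(Array("document"))   // requiredAnnotatorTypes is Array(DOCUMENT)
      .setOutputCol("assertion")
      .setLabelCol("label")              // e.g. 'Negated', 'Affirmed', 'Historical'
      .setStart("start")                 // column with the target's first token index
      .setEnd("end")                     // column with the target's last token index
      .setBefore(11)                     // tokens of context kept left of the target
      .setAfter(13)                      // tokens of context kept right of the target
      .setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary)

    // documentAssembler is assumed to be a previously defined stage that produces
    // the DOCUMENT-type "document" column; trainingData supplies the columns above.
    val pipeline = new Pipeline().setStages(Array(documentAssembler, assertionStatus))
    val model = pipeline.fit(trainingData)

The integer form used in the "+" line (200, 3) and the enum form shown here should be interchangeable if 3 is indeed the Binary format's id, which this diff implies but does not state.
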
diff --git a/python/example/logreg-assertion/assertion.ipynb b/python/example/logreg-assertion/assertion.ipynb index 982df84b546f68..e57ff3bb38b04e 100644 --- a/python/example/logreg-assertion/assertion.ipynb +++ b/python/example/logreg-assertion/assertion.ipynb @@ -203,40 +203,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": { "scrolled": false }, - "outputs": [ - { - "ename": "Py4JJavaError", - "evalue": "An error occurred while calling o220.load.\n: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()\n\tat java.lang.Class.getMethod(Class.java:1786)\n\tat org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)\n\tat org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mml\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPipelineModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPipeline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mPipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./assertion_pipeline\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0msameModel\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mPipelineModel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./assertion_model\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pyspark/ml/util.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbasestring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 196\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"path should be a basestring, got type %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 197\u001b[0;31m \u001b[0mjava_obj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 198\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_clazz\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"_from_java\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 199\u001b[0m raise NotImplementedError(\"This Java ML type cannot be loaded into Python currently: %r\"\n", - "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/py4j/java_gateway.pyc\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1131\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1132\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1133\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1134\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pyspark/sql/utils.pyc\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m 
\u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/py4j/protocol.pyc\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 317\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 320\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m raise Py4JError(\n", - "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o220.load.\n: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()\n\tat java.lang.Class.getMethod(Class.java:1786)\n\tat org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)\n\tat scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)\n\tat org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)\n\tat org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n" - ] - } - ], + "outputs": [], "source": [ "from pyspark.ml import PipelineModel, Pipeline\n", "\n", "Pipeline.read().load(\"./assertion_pipeline\")\n", "sameModel = PipelineModel.read().load(\"./assertion_model\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 55c01748211431..95a46d7586ad41 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -6,6 +6,7 @@ 
sys.modules['com.johnsnowlabs.nlp.annotators.ner'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.ner.regex'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.ner.crf'] = annotator +sys.modules['com.johnsnowlabs.nlp.annotators.assertion'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.assertion.logreg'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.pos'] = annotator sys.modules['com.johnsnowlabs.nlp.annotators.pos.perceptron'] = annotator diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py index ec41ae65c729e3..bd5f0b00de36fa 100755 --- a/python/sparknlp/annotator.py +++ b/python/sparknlp/annotator.py @@ -18,6 +18,7 @@ ner = sys.modules[__name__] crf = sys.modules[__name__] assertion = sys.modules[__name__] +logreg = sys.modules[__name__] regex = sys.modules[__name__] sbd = sys.modules[__name__] sda = sys.modules[__name__] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index 4b7d2bc3897218..a22312def0b8bb 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -1,10 +1,9 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg -import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.embeddings.{ApproachWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.ml.param.{DoubleParam, IntParam, Param, ParamMap} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.Dataset @@ -114,3 +113,5 @@ class AssertionLogRegApproach(val uid: String) } + +object AssertionLogRegApproach extends DefaultParamsReadable[AssertionLogRegApproach] From a5132af1e16ce525bc824625f458db0d5a422471 Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 26 Jan 2018 10:36:12 -0300 Subject: [PATCH 48/55] transient lazy pattern --- .../johnsnowlabs/nlp/BaseAnnotatorModel.scala | 42 ------------------- .../nlp/DatasetAnnotatorModel.scala | 11 ----- .../johnsnowlabs/nlp/HasWordEmbeddings.scala | 24 ++++------- .../com/johnsnowlabs/nlp/RawAnnotator.scala | 14 +++++++ .../logreg/AssertionLogRegApproach.scala | 1 - .../logreg/AssertionLogRegModel.scala | 11 +---- 6 files changed, 24 insertions(+), 79 deletions(-) delete mode 100644 src/main/scala/com/johnsnowlabs/nlp/BaseAnnotatorModel.scala delete mode 100644 src/main/scala/com/johnsnowlabs/nlp/DatasetAnnotatorModel.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/BaseAnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/BaseAnnotatorModel.scala deleted file mode 100644 index d76efe69c1e404..00000000000000 --- a/src/main/scala/com/johnsnowlabs/nlp/BaseAnnotatorModel.scala +++ /dev/null @@ -1,42 +0,0 @@ -package com.johnsnowlabs.nlp - -import org.apache.spark.ml.Model -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.util.DefaultParamsWritable -import org.apache.spark.sql.types._ - -/** - * This trait implements logic that applies nlp using Spark ML Pipeline transformers - * Should strongly change once UsedDefinedTypes are 
allowed - * https://issues.apache.org/jira/browse/SPARK-7768 - */ -abstract class BaseAnnotatorModel[M <: Model[M]] - extends Model[M] - with DefaultParamsWritable - with HasAnnotatorType - with HasInputAnnotationCols - with HasOutputAnnotationCol { - - /** - * takes a document and annotations and produces new annotations of this annotator's annotation type - * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any - * @return any number of annotations processed for every input annotation. Not necessary one to one relationship - */ - protected def annotate(annotations: Seq[Annotation]): Seq[Annotation] - - /** Shape of annotations at output */ - private def outputDataType: DataType = ArrayType(Annotation.dataType) - - /** requirement for pipeline transformation validation. It is called on fit() */ - override final def transformSchema(schema: StructType): StructType = { - val metadataBuilder: MetadataBuilder = new MetadataBuilder() - metadataBuilder.putString("annotatorType", annotatorType) - val outputFields = schema.fields :+ - StructField(getOutputCol, outputDataType, nullable = false, metadataBuilder.build) - StructType(outputFields) - } - - /** requirement for annotators copies */ - override def copy(extra: ParamMap): M = defaultCopy(extra) - -} \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/DatasetAnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/DatasetAnnotatorModel.scala deleted file mode 100644 index 04f3e86a184dce..00000000000000 --- a/src/main/scala/com/johnsnowlabs/nlp/DatasetAnnotatorModel.scala +++ /dev/null @@ -1,11 +0,0 @@ -package com.johnsnowlabs.nlp - -import org.apache.spark.ml.Model - -/** - * Created by jose on 21/01/18. - * This class allows for model evaluation happening on distributed Spark collections - */ -trait DatasetAnnotatorModel[M <: Model[M]] extends BaseAnnotatorModel[M] { - -} diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala index 6bd46f3f3b8a5e..55fd4b6f9451e8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala @@ -25,22 +25,14 @@ trait HasWordEmbeddings extends AutoCloseable with ParamsAndFeaturesWritable { def setDims(nDims: Int): this.type = set(this.nDims, nDims) def setIndexPath(path: String): this.type = set(this.indexPath, path) - @transient - var wembeddings: Option[WordEmbeddings] = None - - def embeddings(): Option[WordEmbeddings] = { - if (wembeddings == null || wembeddings.isEmpty) { - wembeddings = get(indexPath).map { path => - // Have to copy file because RockDB changes it and Spark rises Exception - val src = SparkFiles.get(path) - val workPath = src + "_work" - if (!new File(workPath).exists()) - FileUtil.deepCopy(new File(src), new File(workPath), null, false) - - WordEmbeddings(workPath, $(nDims)) - } - } - wembeddings + @transient lazy val embeddings: Option[WordEmbeddings] = get(indexPath).map { path => + // Have to copy file because RockDB changes it and Spark rises Exception + val src = SparkFiles.get(path) + val workPath = src + "_work" + if (!new File(workPath).exists()) + FileUtil.deepCopy(new File(src), new File(workPath), null, false) + + WordEmbeddings(workPath, $(nDims)) } override def close(): Unit = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala b/src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala new file mode 100644 index 
00000000000000..6e43351d34b3bb --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala @@ -0,0 +1,14 @@ +package com.johnsnowlabs.nlp + +import org.apache.spark.ml.Model + +/** + * Created by jose on 25/01/18. + */ +abstract class RawAnnotator[M<:Model[M]] extends Model[M] + with ParamsAndFeaturesWritable + with HasAnnotatorType + with HasInputAnnotationCols + with HasOutputAnnotationCol + with HasWordEmbeddings { +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index a22312def0b8bb..fc07f9637ef9fc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -17,7 +17,6 @@ import scala.collection.mutable class AssertionLogRegApproach(val uid: String) extends ApproachWithWordEmbeddings[AssertionLogRegApproach, AssertionLogRegModel] with Windowing { - //AnnotatorApproach[AssertionLogRegModel] override val requiredAnnotatorTypes = Array(DOCUMENT) val description: String = "Clinical Text Status Assertion" override val tokenizer: Tokenizer = new SimpleTokenizer diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 7e0f27023f0012..87f94350a83e72 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -20,15 +20,8 @@ import scala.collection.mutable * Created by jose on 22/11/17. 
*/ -class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) extends Model[AssertionLogRegModel] - with ParamsAndFeaturesWritable - with HasAnnotatorType - with HasInputAnnotationCols - with HasOutputAnnotationCol - with HasWordEmbeddings - with Windowing - with Serializable - with TransformModelSchema { +class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) extends RawAnnotator[AssertionLogRegModel] + with Windowing with Serializable with TransformModelSchema { override val tokenizer: Tokenizer = new SimpleTokenizer override val annotatorType: AnnotatorType = ASSERTION From a4476069ea4a2a964db127bb0e19b851f5bc36a0 Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 26 Jan 2018 12:08:50 -0300 Subject: [PATCH 49/55] cleanup --- .../com/johnsnowlabs/nlp/AnnotatorModel.scala | 4 +- .../logreg/AssertionLogRegApproach.scala | 6 +- .../logreg/AssertionLogRegModel.scala | 5 +- .../assertion/logreg/RegexTokenizer.scala | 29 ----- .../assertion/logreg/Tokenizer.scala | 2 - .../assertion/logreg/Windowing.scala | 11 -- .../ApproachWithWordEmbeddings.scala | 3 +- .../nlp/embeddings/WordEmbeddings.scala | 2 - .../johnsnowlabs/ml/logreg/I2b2Reader.scala | 119 ------------------ .../ml/logreg/NegexDatasetLogRegTest.scala | 13 +- .../ml/logreg/NegexDatasetPipelineTest.scala | 12 +- .../ml/logreg/NegexDatasetReader.scala | 5 +- 12 files changed, 27 insertions(+), 184 deletions(-) delete mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala delete mode 100644 src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala index 21c9841d7e77de..d93baec82586ed 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala @@ -9,7 +9,9 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.MetadataBuilder /** - * Created by jose on 21/01/18. 
+ * This trait implements logic that applies nlp using Spark ML Pipeline transformers + * Should strongly change once UsedDefinedTypes are allowed + * https://issues.apache.org/jira/browse/SPARK-7768 */ abstract class AnnotatorModel[M <: Model[M]] extends Model[M] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index fc07f9637ef9fc..c92d1f0333ffd4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -4,7 +4,7 @@ import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.embeddings.{ApproachWithWordEmbeddings, WordEmbeddings} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} -import org.apache.spark.ml.param.{DoubleParam, IntParam, Param, ParamMap} +import org.apache.spark.ml.param.{DoubleParam, IntParam, Param} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ @@ -29,8 +29,6 @@ class AssertionLogRegApproach(val uid: String) // example of possible values, 'Negated', 'Affirmed', 'Historical' val label = new Param[String](this, "label", "Column with one label per document") - // the document where we're extracting the assertion - val document = new Param[String](this, "document", "Column with the text to be analyzed") // the target term, that must appear capitalized in the document, e.g., 'diabetes' val target = new Param[String](this, "target", "Column with the target to analyze") val maxIter = new IntParam(this, "maxIter", "Max number of iterations for algorithm") @@ -55,7 +53,6 @@ class AssertionLogRegApproach(val uid: String) def setEnd(end: String) = set(endParam, end) setDefault(label -> "label", - document -> "document", target -> "target", maxIter -> 26, regParam -> 0.00192, @@ -110,7 +107,6 @@ class AssertionLogRegApproach(val uid: String) private def labelToNumber(mappings: Map[String, Double]) = udf { label:String => mappings.get(label)} - } object AssertionLogRegApproach extends DefaultParamsReadable[AssertionLogRegApproach] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 87f94350a83e72..4dd86614b2cdf1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -1,17 +1,16 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT} -import com.johnsnowlabs.nlp.{HasOutputAnnotationCol, _} +import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.embeddings.WordEmbeddings import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.hadoop.fs.Path -import org.apache.spark.ml.Model import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.functions._ -import 
org.apache.spark.sql.types._ + import scala.collection.immutable.Map import scala.collection.mutable diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala deleted file mode 100644 index b49765e59d8ef8..00000000000000 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/RegexTokenizer.scala +++ /dev/null @@ -1,29 +0,0 @@ -package com.johnsnowlabs.nlp.annotators.assertion.logreg - -/** - * Created by jose on 18/12/17. - */ -class RegexTokenizer extends Tokenizer{ - - /* these match the behavior we had when tokenizing sentences for word embeddings */ - val punctuation = Seq(".", ":", ";", ",", "?", "!", "+", "-", "_", "(", ")", "{", - "}", "#", "mg/kg", "ml", "m2", "cm", "/", "\\", "\"", "'", "[", "]", "%", "<", ">", "&", "=") - - val percent_regex = """([0-9]{1,2}\.[0-9]{1,2}%|[0-9]{1,3}%)""".r - val number_regex = """([0-9]{1,6})""".r - - override def tokenize(sent: String): Array[String] = { - // replace percentage - var tmp = percent_regex.replaceAllIn(sent, " percentnum ") - - // unbind special chars - for (c <- punctuation) { - tmp = tmp.replaceAllLiterally(c, " " + c + " ") - } - - // replace any num - val result = number_regex.replaceAllIn(tmp, " digitnum ").toLowerCase.split(" ").filter(_!="") - result - } - -} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala index 38d62dc807ecdd..4167f2dcaa7c14 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Tokenizer.scala @@ -7,6 +7,4 @@ trait Tokenizer extends Serializable { def tokenize(sent: String) : Array[String] - - } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala index 7f2f1ee22d24c2..5416bc62124d62 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/Windowing.scala @@ -56,7 +56,6 @@ trait Windowing extends Serializable { applyWindow(doc, start, end) } - /* TODO targetTerm never used */ def applyWindow(wvectors: WordEmbeddings) (doc:String, targetTerm:String, s:Int, e:Int) : Array[Double] = { val tokens = doc.split(" ").filter(_!="") @@ -73,16 +72,6 @@ trait Windowing extends Serializable { r.flatMap(w => normalize(wvectors.getEmbeddings(w).map(_.toDouble))) } - def applyWindowUdf(wvectors: WordEmbeddings, codes: Map[String, Array[Double]]) = - udf {(doc:String, pos:mutable.WrappedArray[GenericRowWithSchema], start:Int, end:Int, targetTerm:String) => - val (l, t, r) = applyWindow(doc.toLowerCase, targetTerm.toLowerCase) - var target = Array(0.1, -0.1) - var nonTarget = Array(-0.1, 0.1) - l.flatMap(w => wvectors.getEmbeddings(w)).map(_.toDouble) ++ - t.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ).map(_.toDouble) ++ - r.flatMap(w => wvectors.getEmbeddings(w).map(_.toDouble) ).map(_.toDouble) - } - def applyWindowUdf = //here 's' and 'e' are token numbers for start and end of target when split on " " udf { (doc:String, targetTerm:String, s:Int, e:Int) => diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala 
b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala index 7a98db2c5597f5..76123a845baf89 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala @@ -3,8 +3,7 @@ package com.johnsnowlabs.nlp.embeddings import java.io.File import java.nio.file.Files import java.util.UUID - -import com.johnsnowlabs.nlp.{AnnotatorApproach, AnnotatorModel, HasWordEmbeddings} +import com.johnsnowlabs.nlp.{AnnotatorApproach, HasWordEmbeddings} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext import org.apache.spark.ml.Model diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala index bdd6aa9aae9cd0..4a869b3c659ddc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala @@ -28,8 +28,6 @@ case class WordEmbeddings(dbFile: String, } def getEmbeddings(word: String): Array[Float] = { - if(word.contains(" ")) - println("ERROR") lru.getOrElseUpdate(word, getEmbeddingsFromDb(word)) } diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala deleted file mode 100644 index eff8351689658e..00000000000000 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/I2b2Reader.scala +++ /dev/null @@ -1,119 +0,0 @@ -package com.johnsnowlabs.ml.logreg - -import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsIndexer} -import org.apache.spark.sql.functions.udf -import org.apache.spark.sql.{DataFrame, SparkSession} -import java.io.File -import scala.io.Source - -/** - * Reader for the i2b2 dataset - * -*/ - -class I2b2DatasetReader(wordEmbeddingsFile: String, targetLengthLimit:Int) extends Serializable { - - var fileDb = wordEmbeddingsFile + ".db" - - /* receives the location of a single dataset (e.g. 'beth'), - * and returns a sequence of datapoins I2b2AnnotationAndText - * */ - private def read(path: String): Seq[I2b2AnnotationAndText] = { - - // read list of ast files, without extension - val astFileNames = { - val ast = new File(s"$path/ast/") - if (ast.exists && ast.isDirectory) - ast.listFiles.filter(_.isFile).toList.map(_.getName.dropRight(4)) - else - List[String]() - } - - var tooLong = 0 - - // extract datapoints from each file - val datapoints = for {name <- astFileNames - annotation <- Source.fromFile(s"$path/ast/$name.ast").getLines() - sourceTxt = Source.fromFile(s"$path/txt/$name.txt").getLines().toList - } yield { - val record = I2b2Annotation(annotation) - val text = sourceTxt(record.sourceLine - 1) - if(record.target.split(" ").length > targetLengthLimit){ - tooLong += 1 - null - } - else - I2b2AnnotationAndText(text, record.target, record.label, record.start, record.end) - } - println("number of targets too long: " + tooLong) - datapoints - } - - /* reads all the locations for all datasets (e.g. 
['beth', 'partners']), - * and returns a Spark DataFrame - * */ - def readDataFrame(datasetPaths: Seq[String]) (implicit session: SparkSession): DataFrame= { - import session.implicits._ - datasetPaths.flatMap(read).filter(_!=null).toDF //.withColumn("label", labelToNumber($"label")) - } - - - lazy val wordVectors: Option[WordEmbeddings] = Option(wordEmbeddingsFile).map { - wordEmbeddingsFile => - require(new File(wordEmbeddingsFile).exists()) - val fileDb = wordEmbeddingsFile + ".db" - if (!new File(fileDb).exists()) - WordEmbeddingsIndexer.indexBinary(wordEmbeddingsFile, fileDb) - }.filter(_ => new File(fileDb).exists()) - .map(_ => WordEmbeddings(fileDb, 200)) - - -} -case class I2b2Annotation(target: String, label: String, start:Int, end:Int, sourceLine:Int) -case class I2b2AnnotationAndText(text: String, target: String, label: String, start:Int, end:Int) - -object I2b2Annotation { - - private def extractTarget(text:String): String = { - val pattern = "c=\"(.*)\"".r - pattern.findFirstMatchIn(text).map(_.group(1)). - getOrElse(throw new RuntimeException("Broken dataset - bad target")) - } - - private def extractSourceLine(text: String): Int = { - val pattern = "c=\".*\" (\\d+):\\d+".r - pattern.findFirstMatchIn(text).map(_.group(1)). - getOrElse(throw new RuntimeException("Broken dataset - bad source line")).toInt - } - - def extractLimits(text: String): (Int, Int) = { - val startPattern = "\\d+:(\\d+)\\s\\d+:\\d+".r - val endPattern = "\\d+:\\d+\\s\\d+:(\\d+)".r - - val start = startPattern.findAllMatchIn(text).map(_.group(1)).toList match { - case s::Nil => s.toInt - case _ => throw new RuntimeException("Broken dataset - bad start") - } - - val end = endPattern.findAllMatchIn(text).map(_.group(1)).toList match { - case e::Nil => e.toInt - case _ => throw new RuntimeException("Broken dataset - bad end") - } - (start, end) - } - - def extractLabel(text: String) = { - val pattern = "a=\"(.*)\"".r - pattern.findFirstMatchIn(text).map(_.group(1)). - getOrElse(throw new RuntimeException("Broken dataset - bad source line")) - } - - def apply(annotation: String): I2b2Annotation = { - val chunks = annotation.split("\\|\\|") - val target = extractTarget(chunks(0)) - val sourceLine = extractSourceLine(chunks(0)) - val (start, end) = extractLimits(chunks(0)) - val label = extractLabel(chunks(2)) - I2b2Annotation(target, label, start, end, sourceLine) - } -} diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala index 49468d51b8b34b..0521e4b6874abc 100644 --- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetLogRegTest.scala @@ -10,8 +10,10 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, SparkSession} /** - * Test on simple dataset from NegEx - * Created by jose on 22/01/18. 
+ * Test on simple dataset from NegEx, which can be obtained from:
+ * https://raw.githubusercontent.com/mongoose54/negex/master/genConText/rsAnnotations-1-120-random.txt
+ * Word Embeddings can be obtained from:
+ * https://github.com/cambridgeltl/BioNLP-2016
  */
 object NegexDatasetLogRegTest extends App with Windowing with EvaluationMetrics {
@@ -20,12 +22,15 @@ object NegexDatasetLogRegTest extends App with Windowing with EvaluationMetrics
   override val tokenizer: Tokenizer = new SimpleTokenizer
 
   /* local Spark for test */
-  implicit val spark = SparkSession.builder().appName("DataFrame-UDF").master("local[4]").getOrCreate()
+  implicit val spark = SparkSession.builder().
+    appName("Simple Assertion Status test on Negex dataset").
+    master("local[4]").getOrCreate()
+
   import spark.implicits._
 
   val datasetPath = "rsAnnotations-1-120-random.txt"
   val embeddingsDims = 200
-  val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin"
+  val embeddingsFile = s"PubMed-shuffle-win-2.bin"
   val fileDb = embeddingsFile + ".db"
 
   override lazy val wordVectors: Option[WordEmbeddings] = Option(embeddingsFile).map {
diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala
index 49e78c759da9fa..23b2bcef1c2349 100644
--- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetPipelineTest.scala
@@ -7,6 +7,14 @@ import com.johnsnowlabs.nlp.{Annotation, DocumentAssembler}
 import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
 
+/*
+* Test Assertion Status on the Pipeline.
+* Dataset from NegEx, which can be obtained from:
+* https://raw.githubusercontent.com/mongoose54/negex/master/genConText/rsAnnotations-1-120-random.txt
+* Word Embeddings can be obtained from:
+* https://github.com/cambridgeltl/BioNLP-2016
+* */
+
 object NegexDatasetPipelineTest extends App with EvaluationMetrics {
 
   implicit val spark = SparkSession.builder().appName("i2b2 logreg").master("local[1]").getOrCreate
@@ -16,7 +24,7 @@ object NegexDatasetPipelineTest extends App with EvaluationMetrics {
   val i2b2Dir = "/home/jose/Downloads/i2b2"
 
   // word embeddings location
-  val embeddingsFile = s"/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin"
+  val embeddingsFile = s"PubMed-shuffle-win-2.bin"
   val datasetPath = "rsAnnotations-1-120-random.txt"
   val embeddingsDims = 200
 
@@ -69,7 +77,6 @@ object NegexDatasetPipelineTest extends App with EvaluationMetrics {
   }
 
   def trainAssertionModel(dataset: DataFrame): PipelineModel = {
-
     System.out.println("Start fitting")
 
     // train Assertion Status
@@ -83,5 +90,4 @@
     System.out.println("Test Dataset Reading")
     model.transform(dataset)
   }
-
 }
diff --git a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala
index 5fbe7bebf46645..c766b1c0dfabe6 100644
--- a/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala
+++ b/src/test/scala/com/johnsnowlabs/ml/logreg/NegexDatasetReader.scala
@@ -22,7 +22,7 @@ class NegexDatasetReader(targetLengthLimit: Int = 10) extends Serializable {
       indexOfSlice(targetTokens)
     val lastTargetIdx = firstTargetIdx + targetTokens.size - 1
 
-    if( lastTargetIdx < 0 || firstTargetIdx <0)
+    if( lastTargetIdx < 0 || firstTargetIdx < 0)
       print(sentence)
 
     (firstTargetIdx,
lastTargetIdx) } @@ -51,7 +51,7 @@ class NegexDatasetReader(targetLengthLimit: Int = 10) extends Serializable { } .map{ line => val chunks = line.split("\t") - // keep single spaces + // keep single spaces only val doc = chunks(3).split(" ").map(_.trim).filter(_!="").mkString(" ") val (s, e) = getTargetIndices(doc, chunks(2)) Datapoint(doc.map(_.toLower), @@ -62,7 +62,6 @@ class NegexDatasetReader(targetLengthLimit: Int = 10) extends Serializable { dataframe } - } case class Datapoint(sentence: String, target: String, label: String, start:Int, end:Int) From bf944d0aea39fd9d997d9c675c680b6b5ffab3e3 Mon Sep 17 00:00:00 2001 From: Alberto Date: Fri, 26 Jan 2018 12:25:32 -0300 Subject: [PATCH 50/55] removed embeddings logic from RawAnnotator --- src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala | 3 +-- .../nlp/annotators/assertion/logreg/AssertionLogRegModel.scala | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala b/src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala index 6e43351d34b3bb..b8df5797532fd2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/RawAnnotator.scala @@ -9,6 +9,5 @@ abstract class RawAnnotator[M<:Model[M]] extends Model[M] with ParamsAndFeaturesWritable with HasAnnotatorType with HasInputAnnotationCols - with HasOutputAnnotationCol - with HasWordEmbeddings { + with HasOutputAnnotationCol { } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 4dd86614b2cdf1..29f8a963906e2e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -20,7 +20,7 @@ import scala.collection.mutable */ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) extends RawAnnotator[AssertionLogRegModel] - with Windowing with Serializable with TransformModelSchema { + with Windowing with Serializable with TransformModelSchema with HasWordEmbeddings { override val tokenizer: Tokenizer = new SimpleTokenizer override val annotatorType: AnnotatorType = ASSERTION From 8e364e0910a321ab58b91cc61609cc4af26f2cb1 Mon Sep 17 00:00:00 2001 From: Saif Addin Date: Sat, 27 Jan 2018 12:37:34 -0300 Subject: [PATCH 51/55] - New tokenizer wrap up --- .../johnsnowlabs/nlp/HasWordEmbeddings.scala | 2 +- .../nlp/ParamsAndFeaturesWritable.scala | 4 +- .../logreg/AssertionLogRegModel.scala | 37 +++++-------------- .../annotators/common/IntStringMapParam.scala | 31 ---------------- .../annotators/common/StringMapParam.scala | 30 --------------- 5 files changed, 12 insertions(+), 92 deletions(-) delete mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/common/IntStringMapParam.scala delete mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/common/StringMapParam.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala index 55fd4b6f9451e8..270ed4afcd1e8d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala @@ -82,7 +82,7 @@ trait HasWordEmbeddings extends AutoCloseable with ParamsAndFeaturesWritable { def getEmbeddingsSerializedPath(path: String): Path = Path.mergePaths(new 
Path(path), new Path("/embeddings")) - override def onWritten(path: String, spark: SparkSession): Unit = { + override def onWrite(path: String, spark: SparkSession): Unit = { deserializeEmbeddings(path, spark.sparkContext) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala b/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala index aac623b487c02d..760ced1513b419 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/ParamsAndFeaturesWritable.scala @@ -21,12 +21,12 @@ class FeaturesWriter[T](annotatorWithFeatures: HasFeatures, baseWriter: MLWriter trait ParamsAndFeaturesWritable extends DefaultParamsWritable with Params with HasFeatures { - def onWritten(path: String, spark: SparkSession): Unit = {} + def onWrite(path: String, spark: SparkSession): Unit = {} override def write: MLWriter = new FeaturesWriter( this, super.write, - (path: String, spark: SparkSession) => onWritten(path, spark) + (path: String, spark: SparkSession) => onWrite(path, spark) ) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 29f8a963906e2e..c02664cdd737d2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -3,6 +3,7 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.embeddings.WordEmbeddings +import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} @@ -11,7 +12,6 @@ import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.functions._ - import scala.collection.immutable.Map import scala.collection.mutable @@ -37,6 +37,9 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS val startParam = new Param[String](this, "startParam", "Column that contains the token number for the start of the target") val endParam = new Param[String](this, "endParam", "Column that contains the token number for the end of the target") + var model: Param[LogisticRegressionModel] = new Param[LogisticRegressionModel](this, "logistic regression", "trained lr for prediction") + var labelMap: MapFeature[Double, String] = new MapFeature[Double, String](this, "labels") + override lazy val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam)) setDefault( @@ -55,7 +58,6 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS s"${requiredAnnotatorTypes.mkString(", ")}") import dataset.sqlContext.implicits._ - require(model.isDefined, "model must be set before tagging") /* apply UDF to fix the length of each document */ val processed = dataset.toDF. 
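
To make the data flow of this refactor concrete: after this change, transform windows each document into a fixed-length "features" vector via applyWindowUdf, scores it with the stored LogisticRegressionModel, and maps the numeric prediction back to its label string. A minimal sketch of that flow in plain Spark types follows; predictAssertions, the bare Map, and the "assertion" output column name are illustrative stand-ins for the Param/MapFeature wrappers and getOutputCol used in the hunks around this point:

    import org.apache.spark.ml.classification.LogisticRegressionModel
    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.functions.{col, udf}

    def predictAssertions(lr: LogisticRegressionModel,
                          labelMap: Map[Double, String],
                          processed: DataFrame): DataFrame = {
      // LogisticRegressionModel.transform reads the "features" vector column
      // built by applyWindowUdf and appends a Double "prediction" column
      val scored = lr.transform(processed)
      // translate the class index back to the label seen at fit time,
      // e.g. 0.0 -> "Affirmed", 1.0 -> "Negated"
      val toLabel = udf { prediction: Double => labelMap(prediction) }
      scored.withColumn("assertion", toLabel(col("prediction")))
    }
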
@@ -65,10 +67,10 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS
         col(getOrDefault(startParam)),
         col(getOrDefault(endParam))))
 
-    model.get.transform(processed).withColumn(getOutputCol, packAnnotations($"text", $"target", $"start", $"end", $"prediction"))
+    $(model).transform(processed).withColumn(getOutputCol, packAnnotations($"text", $"start", $"end", $"prediction"))
   }
 
-  private def packAnnotations = udf { (text: String, target: String, s: Int, e: Int, prediction: Double) =>
+  private def packAnnotations = udf { (text: String, s: Int, e: Int, prediction: Double) =>
     val tokens = text.split(" ").filter(_!="")
 
     /* convert start and end to indexes in the doc string */
@@ -77,24 +79,13 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS
     val end = start + tokens.slice(s, e + 1).map(_.length).sum +
       tokens.slice(s, e + 1).size - 2 // account for spaces
-    val annotation = Annotation("assertion", start, end, labelMap.get(prediction), Map())
+    val annotation = Annotation("assertion", start, end, $$(labelMap)(prediction), Map())
     Seq(annotation)
   }
 
-  var model: Option[LogisticRegressionModel] = None
-  var labelMap: Option[Map[Double, String]] = None
+  def setModel(m: LogisticRegressionModel): this.type = set(model, m)
 
-  def setModel(m: LogisticRegressionModel): AssertionLogRegModel = {
-    model = Some(m)
-    this
-  }
-
-  def setLabelMap(labelMappings: Map[String, Double]) = {
-    labelMap = Some(labelMappings.map(_.swap))
-    this
-  }
-
-  override def write: MLWriter = new AssertionLogRegModel.AssertionModelWriter(this, super.write)
+  def setLabelMap(labelMappings: Map[String, Double]): this.type = set(labelMap, labelMappings.map(_.swap))
 
   /* send this to common place */
   def extractTextUdf = udf { document:mutable.WrappedArray[GenericRowWithSchema] =>
@@ -139,16 +130,6 @@ object AssertionLogRegModel extends DefaultParamsReadable[AssertionLogRegModel]
   class AssertionModelWriter(model: AssertionLogRegModel, baseWriter: MLWriter) extends MLWriter {
 
     override protected def saveImpl(path: String): Unit = {
-      require(model.model.isDefined, "Assertion Model must be defined before serialization")
-      require(model.labelMap.isDefined, "Label Map must be defined before serialization")
-      baseWriter.save(path)
-      val modelPath = new Path(path, "model").toString
-      model.model.get.save(modelPath)
-
-      val spark = sparkSession
-      import spark.sqlContext.implicits._
-      val labelsPath = new Path(path, "labels").toString
-      model.labelMap.get.toSeq.map(p => p._1 + ":" + p._2).toDS.write.mode("overwrite").parquet(labelsPath)
       model.serializeEmbeddings(path, sparkSession.sparkContext)
     }
 
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/IntStringMapParam.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/IntStringMapParam.scala
deleted file mode 100644
index 624f6520302612..00000000000000
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/IntStringMapParam.scala
+++ /dev/null
@@ -1,31 +0,0 @@
-package com.johnsnowlabs.nlp.annotators.common
-
-import org.apache.spark.ml.param._
-
-import org.json4s.DefaultFormats
-import org.json4s._
-import org.json4s.jackson.JsonMethods._
-
-import scala.collection.JavaConverters._
-
-
-class IntStringMapParam(parent: Params, name: String, doc: String, isValid: Map[String, Int] => Boolean)
-  extends Param[Map[String, Int]](parent, name, doc, isValid) {
-
-  def this(parent: Params, name: String, doc: String) =
-    this(parent, name, doc, (_: Map[String, Int]) => true)
-
-  /** Creates a param
pair with a `java.util.List` of values (for Java and Python). */ - def w(value: java.util.HashMap[String, Int]): ParamPair[Map[String, Int]] = w(value.asScala.toMap) - - override def jsonEncode(value: Map[String, Int]): String = { - import org.json4s.JsonDSL._ - compact(render(value.toSeq)) - } - - override def jsonDecode(json: String): Map[String, Int] = { - implicit val formats = DefaultFormats - parse(json).extract[Seq[(String, Int)]].toMap - } - -} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/StringMapParam.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/StringMapParam.scala deleted file mode 100644 index e895006dd5fd11..00000000000000 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/StringMapParam.scala +++ /dev/null @@ -1,30 +0,0 @@ -package com.johnsnowlabs.nlp.annotators.common - -import org.apache.spark.ml.param._ - -import org.json4s.DefaultFormats -import org.json4s._ -import org.json4s.jackson.JsonMethods._ - -import scala.collection.JavaConverters._ - -class StringMapParam(parent: Params, name: String, doc: String, isValid: Map[String, String] => Boolean) - extends Param[Map[String, String]](parent, name, doc, isValid) { - - def this(parent: Params, name: String, doc: String) = - this(parent, name, doc, (_: Map[String, String]) => true) - - /** Creates a param pair with a `java.util.List` of values (for Java and Python). */ - def w(value: java.util.HashMap[String, String]): ParamPair[Map[String, String]] = w(value.asScala.toMap) - - override def jsonEncode(value: Map[String, String]): String = { - import org.json4s.JsonDSL._ - compact(render(value.toSeq)) - } - - override def jsonDecode(json: String): Map[String, String] = { - implicit val formats = DefaultFormats - parse(json).extract[Seq[(String, String)]].toMap - } - -} From 214873ac91ce4c33eb5f6bd70833946c06a16286 Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 27 Jan 2018 13:00:54 -0300 Subject: [PATCH 52/55] unit test work in progress --- .../logreg/AssertionLogRegApproach.scala | 1 - .../logreg/AssertionLogRegModel.scala | 6 +-- .../ApproachWithWordEmbeddings.scala | 2 + .../johnsnowlabs/nlp/AnnotatorBuilder.scala | 46 +++++++++++++++++++ .../com/johnsnowlabs/nlp/DataBuilder.scala | 4 ++ .../logreg/AssertionLogregApproachSpec.scala | 43 +++++++++++++++++ 6 files changed, 96 insertions(+), 6 deletions(-) create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogregApproachSpec.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index c92d1f0333ffd4..c53c8a94242a41 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -42,7 +42,6 @@ class AssertionLogRegApproach(val uid: String) def setLabelCol(label: String) = set(label, label) - def setDocumentCol(document: String) = set(document, document) def setTargetCol(target: String) = set(target, target) def setMaxIter(max: Int) = set(maxIter, max) def setReg(lambda: Double) = set(regParam, lambda) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index 29f8a963906e2e..7313a40403617d 100644 --- 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -10,8 +10,6 @@ import org.apache.hadoop.fs.Path import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.functions._ - - import scala.collection.immutable.Map import scala.collection.mutable @@ -30,8 +28,6 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS val beforeParam = new IntParam(this, "beforeParam", "Length of the context before the target") val afterParam = new IntParam(this, "afterParam", "Length of the context after the target") - // the document where we're extracting the assertion - val document = new Param[String](this, "document", "Column with the text to be analyzed") // the target term, that must appear capitalized in the document, e.g., 'diabetes' val target = new Param[String](this, "target", "Column with the target to analyze") val startParam = new Param[String](this, "startParam", "Column that contains the token number for the start of the target") @@ -59,7 +55,7 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS /* apply UDF to fix the length of each document */ val processed = dataset.toDF. - withColumn("text", extractTextUdf(col(getOrDefault(document)))). + withColumn("text", extractTextUdf(col(getInputCols.head))). withColumn("features", applyWindowUdf($"text", col(getOrDefault(target)), col(getOrDefault(startParam)), diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala index 76123a845baf89..539590bd00aebf 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ApproachWithWordEmbeddings.scala @@ -47,6 +47,8 @@ abstract class ApproachWithWordEmbeddings[A <: ApproachWithWordEmbeddings[A, M], // 4. 
Create Embeddings for usage during train wembeddings = Some(WordEmbeddings(localPath.get, $(embeddingsNDims))) } + + } diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala index 0e67f45913aa83..af11d0472c0d39 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala @@ -1,6 +1,7 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotators._ +import com.johnsnowlabs.nlp.annotators.assertion.logreg.{AssertionLogRegApproach, AssertionLogRegModel} import com.johnsnowlabs.nlp.annotators.ner.crf.{NerCrfApproach, NerCrfModel} import com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParser import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach @@ -9,6 +10,7 @@ import com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat +import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ @@ -168,5 +170,49 @@ object AnnotatorBuilder extends FlatSpec { this: Suite => .setOutputCol("ner") .fit(df) } + + /* generate a set of random embeddings from tokens in dataset + * rowText is the column containing the text. + * returns the path of the file + * */ + private def generateRandomEmbeddings(dataset: Dataset[Row], rowText: String, dim: Int) = { + import org.apache.spark.sql.functions._ + import java.io.{PrintWriter, File} + val random = scala.util.Random + val filename = s"${rowText}_${random.nextInt(4)}" + val pw = new PrintWriter(new File(filename)) + + val tokens = dataset.toDF().select(col(rowText)). + collect().flatMap(row=> row.getString(0).split(" ")). 
+ distinct + + def randomDoubleArrayStr = (1 to dim).map{_ => random.nextDouble}.mkString(" ") + + for (token <- tokens) + pw.println(s"$token $randomDoubleArrayStr") + + filename + } + + def getAssertionLogregModel(dataset: Dataset[Row]) = { + + val embeddingsPath = generateRandomEmbeddings(dataset, "sentence", 4) + + val documentAssembler = new DocumentAssembler() + .setInputCol("sentence") + .setOutputCol("document") + + val assertion = new AssertionLogRegApproach() + .setLabelCol("label") + .setInputCols("document") + .setOutputCol("assertion") + .setReg(1.0) + .setBefore(11) + .setAfter(13) + .setEmbeddingsSource("src/test/resources/ner-corpus/test_embeddings.txt", 3, WordEmbeddingsFormat.Text) + + val pipeline = new Pipeline().setStages(Array(documentAssembler, assertion)).fit(dataset) + pipeline + } } diff --git a/src/test/scala/com/johnsnowlabs/nlp/DataBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/DataBuilder.scala index d6600d82620781..51a096b9e8f3b1 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/DataBuilder.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/DataBuilder.scala @@ -27,4 +27,8 @@ object DataBuilder extends FlatSpec with BeforeAndAfterAll { this: Suite => .readDatasetFromLines(lines, SparkAccessor.spark).toDF AnnotatorBuilder.withDocumentAssembler(data) } + + def loadParquetDataset(path: String) = + SparkAccessor.spark.read.parquet(path) + } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogregApproachSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogregApproachSpec.scala new file mode 100644 index 00000000000000..ab742b1c97ac75 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogregApproachSpec.scala @@ -0,0 +1,43 @@ +package com.johnsnowlabs.nlp.annotators.assertion.logreg + +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegModel +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.scalatest.FlatSpec + + +class AssertionLogregApproachSpec extends FlatSpec { + // load sample negex dataset + val negexDataset = DataBuilder.loadParquetDataset("src/test/resources/negex.parquet") + val logregPipelineModel = AnnotatorBuilder.getAssertionLogregModel(negexDataset) + + "AssertionLogregApproach" should "be serializable and deserializable correctly" in { + logregPipelineModel.write.overwrite.save("./test_assertion_pipeline") + val loadedAssertionPipeline = PipelineModel.read.load("./test_assertion_pipeline") + loadedAssertionPipeline.transform(negexDataset) + + } + + "AssertionLogregApproach" should "have correct set of labels" in { + val model = logregPipelineModel.stages(1).asInstanceOf[AssertionLogRegModel] + + assert(model.labelMap.get.size == 2) + assert(model.labelMap.get.contains(1.0)) + assert(model.labelMap.get.contains(0.0)) + assert(model.labelMap.get.values.toList.contains("Affirmed")) + assert(model.labelMap.get.values.toList.contains("Negated")) + } + + + "AssertionLogregApproach" should "produce meaningful assertions" in { + val predicted = logregPipelineModel.transform(negexDataset) + + val annotations = Annotation.collect(predicted, "assertion").flatten.map(_.result).toSet + + assert(annotations.size == 2) + assert(annotations.contains("Affirmed")) + assert(annotations.contains("Negated")) + + } + +} \ No newline at end of file From 4d7d9732fa05dbd1586961f61059330907372ca8 Mon Sep 17 00:00:00 2001 From: Saif Addin Date: Sat, 27 Jan 2018 13:45:02 -0300 Subject: [PATCH 
53/55] - Fixed bug in word embeddings write process - Updated LogReg to use read and write standard traits --- .../johnsnowlabs/nlp/HasWordEmbeddings.scala | 2 +- .../logreg/AssertionLogRegApproach.scala | 2 +- .../logreg/AssertionLogRegModel.scala | 55 +++---------------- 3 files changed, 11 insertions(+), 48 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala index 270ed4afcd1e8d..59b2ce7dc811a1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasWordEmbeddings.scala @@ -83,7 +83,7 @@ trait HasWordEmbeddings extends AutoCloseable with ParamsAndFeaturesWritable { def getEmbeddingsSerializedPath(path: String): Path = Path.mergePaths(new Path(path), new Path("/embeddings")) override def onWrite(path: String, spark: SparkSession): Unit = { - deserializeEmbeddings(path, spark.sparkContext) + serializeEmbeddings(path, spark.sparkContext) } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala index c92d1f0333ffd4..c6f95f80b7e763 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegApproach.scala @@ -94,7 +94,7 @@ class AssertionLogRegApproach(val uid: String) val processedWithLabel = processed.withColumn(labelCol, labelToNumber(labelMappings)(col(labelCol))) - AssertionLogRegModel() + new AssertionLogRegModel() .setBefore(getOrDefault(beforeParam)) .setAfter(getOrDefault(afterParam)) .setInputCols(getOrDefault(inputCols)) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index c02664cdd737d2..cc4d4c80880f2b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -2,12 +2,11 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType.{ASSERTION, DOCUMENT} import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.embeddings.WordEmbeddings -import com.johnsnowlabs.nlp.serialization.MapFeature +import com.johnsnowlabs.nlp.embeddings.{EmbeddingsReadable, WordEmbeddings} +import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature} import org.apache.spark.ml.classification.LogisticRegressionModel -import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReader, MLWriter} +import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.hadoop.fs.Path import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.functions._ @@ -19,7 +18,7 @@ import scala.collection.mutable * Created by jose on 22/11/17. 
  */
-class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("ASSERTION")) extends RawAnnotator[AssertionLogRegModel]
+class AssertionLogRegModel(override val uid: String) extends RawAnnotator[AssertionLogRegModel]
   with Windowing with Serializable with TransformModelSchema with HasWordEmbeddings {
 
   override val tokenizer: Tokenizer = new SimpleTokenizer
@@ -37,7 +36,7 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS
   val startParam = new Param[String](this, "startParam", "Column that contains the token number for the start of the target")
   val endParam = new Param[String](this, "endParam", "Column that contains the token number for the end of the target")
 
-  var model: Param[LogisticRegressionModel] = new Param[LogisticRegressionModel](this, "logistic regression", "trained lr for prediction")
+  var model: StructFeature[LogisticRegressionModel] = new StructFeature[LogisticRegressionModel](this, "logistic regression")
   var labelMap: MapFeature[Double, String] = new MapFeature[Double, String](this, "labels")
 
   override lazy val (before, after) = (getOrDefault(beforeParam), getOrDefault(afterParam))
@@ -47,6 +46,8 @@
     afterParam -> 13
   )
 
+  def this() = this(Identifiable.randomUID("ASSERTION"))
+
   def setBefore(before: Int) = set(beforeParam, before)
   def setAfter(after: Int) = set(afterParam, after)
   def setStart(start: String) = set(startParam, start)
@@ -67,7 +68,7 @@ class AssertionLogRegModel(override val uid: String = Identifiable.randomUID("AS
         col(getOrDefault(startParam)),
         col(getOrDefault(endParam))))
 
-    $(model).transform(processed).withColumn(getOutputCol, packAnnotations($"text", $"start", $"end", $"prediction"))
+    $$(model).transform(processed).withColumn(getOutputCol, packAnnotations($"text", $"start", $"end", $"prediction"))
   }
 
   private def packAnnotations = udf { (text: String, s: Int, e: Int, prediction: Double) =>
@@ -96,42 +97,4 @@
   override def copy(extra: ParamMap): AssertionLogRegModel = defaultCopy(extra)
 }
 
-object AssertionLogRegModel extends DefaultParamsReadable[AssertionLogRegModel] {
-  def apply(): AssertionLogRegModel = new AssertionLogRegModel()
-  override def read: MLReader[AssertionLogRegModel] = new AssertionModelReader(super.read)
-
-  class AssertionModelReader(baseReader: MLReader[AssertionLogRegModel]) extends MLReader[AssertionLogRegModel] {
-    override def load(path: String): AssertionLogRegModel = {
-      val instance = baseReader.load(path)
-      val modelPath = new Path(path, "model").toString
-      val loaded = LogisticRegressionModel.read.load(modelPath)
-
-      val labelsPath = new Path(path, "labels").toString
-      val labelsLoaded = sparkSession.sqlContext.read.format("parquet")
-        .load(labelsPath)
-        .collect
-        .map(_.toString)
-
-      val dict = labelsLoaded
-        .map {line =>
-          val items = line.split(":")
-          (items(0).drop(1).toDouble, items(1).dropRight(1))
-        }
-        .toMap
-
-      instance
-        .setLabelMap(dict.map(_.swap))
-        .setModel(loaded)
-      instance.deserializeEmbeddings(path, sparkSession.sparkContext)
-      instance
-    }
-  }
-
-  class AssertionModelWriter(model: AssertionLogRegModel, baseWriter: MLWriter) extends MLWriter {
-
-    override protected def saveImpl(path: String): Unit = {
-
-      model.serializeEmbeddings(path, sparkSession.sparkContext)
-    }
-  }
-}
+object AssertionLogRegModel extends EmbeddingsReadable[AssertionLogRegModel]

From
3b3cad0c168e590b9bd634ef29c31c865d83120a Mon Sep 17 00:00:00 2001 From: Alberto Date: Sat, 27 Jan 2018 13:57:49 -0300 Subject: [PATCH 54/55] unit test --- src/test/resources/random_embeddings_dim4.txt | 2213 +++++++++++++++++ .../johnsnowlabs/nlp/AnnotatorBuilder.scala | 11 +- .../logreg/AssertionLogregApproachSpec.scala | 4 +- 3 files changed, 2222 insertions(+), 6 deletions(-) create mode 100644 src/test/resources/random_embeddings_dim4.txt diff --git a/src/test/resources/random_embeddings_dim4.txt b/src/test/resources/random_embeddings_dim4.txt new file mode 100644 index 00000000000000..43a9b69910b248 --- /dev/null +++ b/src/test/resources/random_embeddings_dim4.txt @@ -0,0 +1,2213 @@ +**initials 0.44105335304332327 0.5721092233315418 0.8415486011009214 0.5039530066673547 +_______________________________________________________________ 0.26250254090306857 0.6708840902301043 0.428391236430646 0.20474884915685854 +final 0.13130287462682 0.7277891440239602 0.43316307658885045 0.30460725315411763 +diagnosis 0.9076976012192696 0.13794145234500588 0.7322121647182661 0.37095428530524843 +: 0.729098602863248 0.7536973339652907 0.031534423238633646 0.12917413098668573 +thyroid 0.07216977447764261 0.5240778409802122 0.6233535294648875 0.9355579107438723 +, 0.8489306103014239 0.7986240515080344 0.8477569267251297 0.22500015658579975 +right 0.20536792700048245 0.8149994873441988 0.2408640883172336 0.5665300940953625 +lobe 0.1998321017206317 0.8125091607109918 0.7504164249198017 0.39824157730606247 +and 0.07853361416470639 0.4178057765323866 0.33034321158662683 0.7329115411610758 +isthmus 0.4367444875322467 0.4344054831970402 0.6877714342564601 0.12657448079103084 +lobectomy 0.1568486757243427 0.9342370387531259 0.9491114549527117 0.9845457741131992 +isthmusectomy 0.6481739491595864 0.5972999220522055 0.5362649142341935 0.8428641494189791 +(22 0.5785774601311279 0.5746082181827189 0.5996649133778412 0.6078496565975735 +. 
0.9840458346970099 0.7599489088742579 0.9417727522448 0.8624503199123044 +7 0.37010987645584803 0.2475848904639707 0.4828195520408002 0.2552773228312475 +grams) 0.17766300335715968 0.9258840582520981 0.9699952781547421 0.6767043562577431 +a 0.7758115805464789 0.6109579694775961 0.8119514903064007 0.3961346457422458 +multinodular 0.9656743750509815 0.8597447819498399 0.6323982857097098 0.5855145564261147 +goiter 0.8315875163775017 0.6185729411016184 0.3797401423447767 0.6863996692907884 +with 0.6599601090832197 0.161093471405374 0.6041093561017523 0.8913561151288232 +dominant 0.5610652011828349 0.3485703296665411 0.818819557117567 0.6052058494468028 +nodule 0.37533940753770356 0.10587008277370458 0.7001762364385464 0.12760168742834876 +(3 0.4233037741610831 0.1480933018766516 0.24774510721396914 0.9556441324599935 +6 0.6670074641926698 0.4156420236974878 0.9265931355099859 0.04372888917544593 +cm) 0.8909351451914811 0.3796189872453668 0.24368117472766015 0.07754005179563728 +02) 0.5816074725840483 0.3648058109566146 0.1725013990190185 0.3200811186745377 +mild 0.2876437029334864 0.30406938256207994 0.3332162907535706 0.8698621778461747 +aortic 0.37584435561799134 0.8858963907286844 0.8599707321457185 0.234837849285504 +regurgitation 0.2638904642629829 0.8858584268889959 0.4427119950936975 0.9704183029574401 +left 0.4065460483941834 0.5831609407895031 0.11776726024816642 0.7558505016918737 +atrial 0.5397881467697926 0.5550313455734139 0.9995916715852663 0.6904952711965086 +enlargement 0.9460789891906322 0.814552875022096 0.2955146179208683 0.7800005000290351 +to 0.7940334560513885 0.21165248469549736 0.6259754691782344 0.5520990961703217 +moderate 0.9270236875243821 0.2523025491884031 0.8846543795660256 0.4927353747333131 +mitral 0.8278026746467351 0.0075979058003403654 0.6160677653411244 0.5254512688714906 +no 0.8700168230731734 0.5822208726198898 0.16261202914896944 0.0704394343491428 +valvular 0.3825169114003748 0.6909091336977262 0.6387033042739192 0.3143977274970847 +abnormalities 0.49866624356138833 0.25189215341788174 0.12657486424945819 0.19829512992550768 +nondilated 0.24038315025475043 0.5478640813092287 0.35645822063784827 0.28738674139409137 +ventricle 0.7954448013052468 0.4279122104565062 0.5332877117051834 0.12629134366622985 +hypertrophy 0.4866175988219147 0.2279215815989517 0.9027323569522799 0.9016717712954236 +grossly 0.5024694784769799 0.5723283405493693 0.11127328954344162 0.23603507564256632 +normal 0.062257214962294305 0.9866955580181411 0.40820880969706597 0.7183957889681587 +function 0.32107213566371384 0.2984369729071744 0.2618828466281553 0.34102233554960815 +ventricular 0.9864659936780474 0.13211553917144503 0.0180975071513253 0.3562130513826599 +size 0.31406783459189436 0.41609568120379914 0.8155837582560083 0.1930249889864163 +paradoxical 0.8772421384178412 0.7089290350449956 0.09156843363204725 0.9811049540852949 +septal 0.2646272558574976 0.32670548223558105 0.6598902381201022 0.9172494322966154 +motion 0.6494322446151714 0.7067028306813775 0.5247811019978261 0.18004566831076008 +consistent 0.9956594531055021 0.1780198307210128 0.30534321157554667 0.3489236293825452 +post-operative 0.5189590293389853 0.3264788860050084 0.3889166117874241 0.3161744919905296 +small 0.9036109057053229 0.43893657450172063 0.7059458608026807 0.49396840688142996 +cavity 0.6921684890095677 0.0912412196801623 0.32461829136144515 0.7131303733242776 +severe 0.21058753604877523 0.8044289146134774 0.900461013462205 0.5864097422823428 +overall 0.3692031159160011 0.8353502182138034 
0.3842471645592731 0.26620576803879037 +preserved 0.6887056013632056 0.5907720442700604 0.2909764458536692 0.4651877551079099 +systolic 0.9403071821480288 0.8581003980081715 0.2633588884400634 0.327975117637866 +03) 0.8605778802971986 0.031221146150250956 0.6251447564593332 0.2430191580912736 +annular 0.5039703715013444 0.7765579498165771 0.11280791833220571 0.966055057613651 +calcification 0.929841310040656 0.13229748908601568 0.9985035158063953 0.48996900513213004 +pulmonary 0.08581602873945782 0.8628343540776703 0.3728854201001781 0.9406519781990393 +artery 0.9456325910942037 8.386835217757405E-4 0.443282445387156 0.4043882619943605 +pressures 0.18714041852986874 0.9009522848229303 0.32089708073397727 0.999553502739201 +thickened 0.9549173864585028 0.8019692333457722 0.9201035625381072 0.6938732823982323 +valve 0.5420088536557668 0.8484599639727719 0.2735872850159855 0.3382201693635365 +04) 0.14658026565776838 0.7581478532453723 0.7789806789390672 0.13890357390865893 +borderline 0.21181553930640395 0.8379504295851732 0.6358154591931693 0.322633277539098 +hypertension 0.2411807148942705 0.8494476221320958 0.0060244802059948865 0.061814033130039636 +flattened 0.39621279188168357 0.30364173831119046 0.23689138468559623 0.8225554757594159 +interventricular 0.6302822072859267 0.1252253747329809 0.8222593581717654 0.1737577589376884 +septum 0.5893162034426292 0.26817451346720145 0.645975215703941 0.7528601041038353 +pressure 0.019260108969176293 0.7873364391469735 0.4509583499884111 0.5223160777417895 +or 0.9600919392344991 0.07478295583578864 0.40794884734477466 0.6705857412614323 +volume 0.5774949754349943 0.4550511150123767 0.1199024227003348 0.512411628743565 +overload 0.08583247864389021 0.09999575095684665 0.39116467351011064 0.5253719772051937 +tricuspid 0.8498570958366162 0.2718288436218542 0.6149702860586475 0.2231809569150821 +vegetations 0.4176269050602096 0.23202290806443915 0.6884780012403938 0.910932838758789 +seen 0.9333892047152847 0.7641068893141166 0.9514650094790383 0.4934627671361853 +by 0.10673206245343891 0.149984912276925 0.7629694960835316 0.41522409040729535 +this 0.6730927345948513 0.6122591482026574 0.9795837227708772 0.49855835427520767 +technique 0.070750944908889 0.6891405008673596 0.7797637239234759 0.9169606364285259 +saline 0.8807246704525715 0.7438816875038314 0.9904366839905394 0.7909199677661778 +contrast 0.42852430165474586 0.8272690354397609 0.31535012663805584 0.9261401498387767 +injection 0.830510789298862 0.4095612201205099 0.6390218077821366 0.027423945844138098 +without 0.6826420994945849 0.07736410121321047 0.8858128769891759 0.3162294597034455 +evidence 0.5356815635623968 0.15128287187444356 0.7505611238231592 0.8558842960349066 +of 0.2784805644103343 0.3217942977887971 0.9853753466180533 0.045669710547629916 +intracardiac 0.5597372342196258 0.8236329617558299 0.8266052065841503 0.6834634602033499 +shunt 0.4186717199193485 0.37308753222641056 0.5766661317375112 0.7163566093503199 +05) 0.36634802689128954 0.0779539795976637 0.31731193426120596 0.2996361717244981 +root 0.2619108980819095 0.7964952332918072 0.22411221393180536 0.06675201972645195 +dilation 0.4617958752839836 0.969607012156787 0.9763059235642289 0.5763628823112482 +06) 0.8850092226992067 0.7794924326225057 0.31186601439148676 0.747283226231022 +7) 0.8338639604754715 0.5984985710412205 0.8645246182401174 0.7181288225251795 +pulmonic 0.6370968013684315 0.08337024343164612 0.037672209566423476 0.9756859272238455 +07) 0.4410802441572854 0.9430277812708491 0.8194576089818163 
0.7845050766466788 +8) 0.7648830522403335 0.36207682405939845 0.1240810293115795 0.9199474207901959 +there 0.7562211329699586 0.2976102357105318 0.45618508064602614 0.4578878923930927 +is 0.4378804460382566 0.383463185627702 0.4384136149360296 0.9065061092111331 +an 0.6179818618263645 0.9739347051307065 0.01662198678223692 0.27907246460007407 +echodensity 0.5524978880183252 0.9380898938550197 0.9628863742966476 0.8013635853130784 +in 0.8947193831812156 0.176109076189785 0.6333740187592377 0.8233174059042571 +the 0.9439099533745765 0.4707513297606908 0.8063001572161052 0.1617655528503179 +atria 0.7141745513078734 0.20985172944515318 0.824611529907514 0.5395561658941704 +patient''s 0.8846082082192859 0.7198731048301513 0.19870056996842878 0.006107956008697624 +know 0.26307923207833295 0.4358843795097801 0.2863679215479372 0.22541185763668548 +interatrial 0.9985410159067986 0.8999396157455707 0.6604158854111043 0.2997242647103906 +baffle 0.4105551943627539 0.6276830886501523 0.29866846506064504 0.36067120915352147 +08) 0.969747934171244 0.2121074460118838 0.7868784875273651 0.5042105366771574 +09) 0.659210629611243 0.6437433343815074 0.311896044512775 0.7718746873509732 +10) 0.6743948569772863 0.7181610258880593 0.3406363591117174 0.7879442550860445 +trivial 0.17784064164419722 0.5745153133648584 0.529799920658819 0.3506781021954527 +pericardial 0.6151165057431864 0.8629670402645478 0.5919607078245314 0.7454531678941707 +effusion 0.9196380971062951 0.7741662785831969 0.5555537698642643 0.7986429100013603 +echocardiographic 0.7865721481638496 0.18058330299275405 0.09945603818307647 0.7232907311635747 +signs 0.0798208433343206 0.43732541409853376 0.4172694340424138 0.6211105757779427 +tamponade 0.3511210524355828 0.9691789970894096 0.3720789360781822 0.1000738385703892 +are 0.39658191506190343 0.630968081620067 0.5393722253731201 0.8428180123359783 +were 0.7535235923631415 0.9699218875629833 0.10397182122983872 0.11833962569383116 +stress 0.0492683418305907 0.9415954572751959 0.47624463167525755 0.16790967216778263 +induced 0.1535748762292387 0.33498936903209897 0.9235178224122094 0.1158772920395934 +chest 0.1576957363238768 0.14786571481083932 0.474544020332508 0.5570031533391828 +pain 0.30557634552051616 0.4198590666418104 0.8480230979165965 0.2541912600065497 +ischemic 0.13650386571167406 0.5788332877068361 0.1806011940948643 0.4439639922645017 +ekg 0.26355228344538006 0.1850355916773806 0.8854329423473457 0.09175190432400204 +changes 0.4084603787712713 0.06795213546088719 0.059305448809491734 0.0650464822514577 +wall 0.5411170146138244 0.24236950565754178 0.4744877414489088 0.599690312729414 +at 0.23061830065419486 0.6629995390762398 0.34088303855741675 0.6230562130387652 +relatively 0.6048994452420027 0.8311249527458527 0.704725138410795 0.4675927824226356 +low 0.6372338794691519 0.5020764329679546 0.5619746758979393 0.2591909243441787 +level 0.8304307028999284 0.2589518428987754 0.5077346298108198 0.7970520867612693 +5 0.753350895739371 0.5158408504730245 0.017418801333885292 0.055215912538150724 +/ 0.2994017114295817 0.5929194106675374 0.26107094719280377 0.5306310453698313 +motor 0.9829577581341993 0.8661302592121877 0.689146465766434 0.5386299799638198 +strength 0.06719636319425282 0.6357666967962755 0.4962322114711285 0.21389499254309907 +all 0.8256236082919789 0.2811294752474076 0.46859452279973246 0.41347859070158977 +extremities 0.7796070572311751 0.47252678498410583 0.6521094322251423 0.9381694594272644 +8 0.32551960167187066 0.6972402824601577 0.7390429225099673 
0.48946994662753596 +cm 0.7676384219304716 0.12268078053787257 0.06531339122988733 0.5616791209668979 +(0 0.406731818219646 0.8678088574986108 0.6342826470769146 0.032050908334221084 +7-1 0.9993058492514353 0.0829978342919988 0.6387851768123327 0.5567243156507651 +1 0.5987718282240829 0.8488672980822262 0.5400205492792166 0.006098288498639803 +referring 0.5173347112532322 0.7962174145890274 0.8800967287545542 0.08055143491365857 +shortness 0.9061913000937626 0.5220457945960918 0.6544271350625027 0.9358823156060062 +breath 0.2416668486981436 0.33073010834553385 0.2508475336731747 0.11757349767786751 +two 0.8262068169919337 0.5648111839699665 0.6742336778414759 0.35395830970416364 +dimensional 0.2656804185712749 0.8064132061510644 0.018563778442439727 0.7905968567141601 +echocardiology 0.7682412816086589 0.5281886269400576 0.9227850112188885 0.06164274969914196 +was 0.025029369899880693 0.35177747908465073 0.05250618354663106 0.18871077698899508 +technically 0.3792880536561716 0.9321223190518328 0.4864539630994297 0.6737300942416062 +difficult 0.9680086570679755 0.6275809626122981 0.25479278667628913 0.30949569111680764 +study 0.529003983299818 0.5386317008304186 0.17582402375061146 0.17010666237459293 +9 0.6063518399889916 0.2020664607238054 0.5682428761022039 0.9632065368079558 +syncope 0.9584378650901225 0.21291924884023794 0.37804621823925655 0.4269175858149913 +collapse 0.07536685995718562 0.31297886134379405 0.7523484857026788 0.7365875104223979 +; 0.8463002297391943 0.032586201301732065 0.11677661933348049 0.9499433700071602 +history 0.5823165287818479 0.8015183130736308 0.3687794351111878 0.3639744552551719 +patient 0.7966810653622362 0.5551125011524928 0.8861005681563193 0.2828420832717228 +**age[in 0.37194982549882005 0.2533859279938293 0.6678283790139965 0.10334097279458565 +60s]-year-old 0.0021213929779553276 0.40375709415381444 0.07485013553844477 0.8129135547257351 +gentleman 0.08809048610931047 0.6320777402086489 0.31720205023054127 0.9649447512506725 +past 0.585945116525283 0.5820348127759281 0.047720675289279924 0.9470418461664301 +medical 0.6601887198701932 0.939583741324666 0.16456498924278518 0.8902301474407576 +significant 0.37775752667561846 0.17692637834369707 0.19959321840079292 0.8804720433186073 +for 0.09374282815842228 0.6896476852921369 0.47524183304763545 0.22864341947214029 +esrd 0.6260163199044612 0.36097737219642745 0.5494684178422533 0.43107801161747994 +hiv 0.7934964476750384 0.22874434223989482 0.33648462736199414 0.11001687727673648 +chronic 0.4015818771800944 0.8534426561895077 0.6904186663003569 0.6617768266509231 +thoracic 0.29831820146197985 0.18993030351453055 0.169711451121365 0.5601778939522715 +aneurysm 0.6961524923698308 0.25063256267810907 0.16196461061306422 0.2637467554009312 +svc 0.5566693528373065 0.6147243842552529 0.5421221989033986 0.45743960375840964 +occlusion 0.4694914511836755 0.8856168053194388 0.561282171161499 0.9231043866495222 +recent 0.12156650649236744 0.5281743739156872 0.7190983094061983 0.6776288646439641 +pe 0.14935159860185354 0.1274427512087417 0.9596462378141863 0.14369773453483226 +who 0.8939449146036184 0.0022656967762696434 0.6487923377668323 0.02530830370299242 +discharged 0.3380040497368978 0.280023968432642 0.4405176945803603 0.2486711594917953 +on 0.2933289479148822 0.9928288893206891 0.32031370536446113 0.7988483587120451 +**date[jul 0.7292769875505475 0.49085548838934245 0.8256796092904956 0.5334586548916723 +2 0.9847160570525587 0.48510179179616164 0.6869168899719718 0.08700567966096062 +2007] 
0.49001131413486343 0.3297119428284331 0.905828905184664 0.7928614106700484
+after 0.5405801954337102 0.7331027275316363 0.5955983973511096 0.1446641238044375
+groin 0.6098385699865873 0.9474848584976355 0.7420300585320281 0.6948663140162717
+av 0.6669013196021725 0.627971006917974 0.4145084865823563 0.44539465599271244
+fistula 0.9731882828649222 0.9753154705149085 0.8544367442813012 0.7238612015564054
+placement 0.8721268563962311 0.48043087316200783 0.29958435530299354 0.2584902313463834
+<> 0.15743037672860127 0.5741790616817606 0.36468775779864815 0.6659035901578901
+complications 0.7079716261231426 0.7525355023900036 0.2918104516707979 0.8481321284958993
+none 0.21105899022913377 0.29507722610909815 0.4083835891782738 0.8586986486339265
[remainder of the added test-resource file: several hundred further lines of randomly generated 4-dimensional embeddings for a clinical-text vocabulary, one whitespace-separated token followed by four floats per line]
0.5673401567316696 +steady 0.9859579877683937 0.8987713567498132 0.909620720077175 0.6680476092347573 +assistance 0.6282016022977025 0.18944365572886668 0.6417327125881236 0.8750003039898462 +gastroesophageal 0.6613302040329708 0.37820593433711613 0.05745420467437523 0.603388314606384 +reflux 0.5197974929462201 0.5489983750448301 0.6334288886790833 0.31393388959309687 +alert 0.04948513977630287 0.4189910385546105 0.06600687733073907 0.5064252028839473 +oriented 0.9629916320575308 0.8710471555137358 0.9735510031359295 0.5535070434763992 +awake 0.5790027644328444 0.5632749100691459 0.039211839807742455 0.3837916829788903 +apparent 5.590361321975434E-4 0.1994381699830523 0.3432256830919057 0.24308066784599536 +lying 0.4583251418444818 0.5293487825835376 0.09570298393449839 0.7634639307445151 +sullen 0.1475048685720557 0.9149639933634078 0.9740398586661462 0.3522900387249527 +facies 0.5208348402268816 0.3457648622651768 0.7934985387642329 0.4764772419092993 +lucid 0.6651284368754695 0.11609216706002146 0.2951668874244171 0.7623620119625422 +amount 0.08513123760766694 0.987468695516743 0.42242152245713305 0.9572581020466931 +holding 0.9459880975270731 0.5505381027344365 0.5751826465459644 0.7061259219483453 +wrist 0.584410602826738 0.6741016603586766 0.9806631102963629 0.7852282052459872 +pleasant 0.8886734497238592 0.3828366064212382 0.499977985945525 0.9043748908460599 +genitourinary 0.9688567448851746 0.24063042310428262 0.9018954704111404 0.44033674628109276 +attending 0.39129066636295917 0.3330680395288299 0.8781733730807175 0.49623315230017573 +physician 0.1349478499627137 0.24045908814654104 0.8493400562619297 0.4951771359854351 +**name[zzz] 0.34256766380794335 0.971895628716947 0.915442428167671 0.11118617448571733 +shown 0.8855863212419836 0.6911805472613687 0.14319778080177126 0.7163445465861928 +ml 0.7394849365072452 0.2070699933340473 0.7491503071949808 0.6804295664338353 +fluid 0.17495745692632314 0.6231233895457582 0.6378472327321595 0.7976901610345978 +balloon 0.8103388250367611 0.24655729064854848 0.1847133557873677 0.8351930980180259 +urethra 0.6420235208190493 0.840850185491545 0.11521723310761722 0.6981290024721079 +gentamycin 0.27214282757339403 0.5734469954029863 0.8250859987597076 0.9896960932052424 +80 0.686559152609708 0.5577640204032874 0.5107398852567985 0.8066570167034913 +ivover 0.10127376062449467 0.7177959150907417 0.03478166839242036 0.4563902661863586 +30 0.591634634982269 0.8139788596930132 0.538290542934139 0.4072338740347009 +minutes 0.4462687291972578 0.8605643010222171 0.4298333115857921 0.19114668467334017 +& 0.44645327016460656 0.8376106138097362 0.5305207677134985 0.16618781154709006 +vancomycin 0.5616791981429642 0.5548006044260451 0.835100331078281 0.01604256489913658 +gm 0.8734487050313887 0.7596446566998518 0.560990663284911 0.6147987511978488 +iv 0.8918885346420602 0.09923105802717658 0.9003278510636703 0.41740460495939014 +over 0.8135490906790698 0.3425363153790252 0.16450668001762936 0.849715679542087 +mvp 0.6837990676958053 0.3494100884550482 0.8163647675113475 0.266764702593005 +regurg 0.8735137889252391 0.6327005986017419 0.2220713743081294 0.92386500875418 +ptc 0.757295101645705 0.0015883285195426033 0.8229466299867649 0.06238481619474079 +drain 0.9892491465725706 0.6970471979213989 0.8268511043175484 0.6505057929850949 +25 0.3123880115428459 0.634453810024847 0.4494497482211365 0.5390066437935156 +bilirubin 0.40900956397968646 0.853717260095101 0.19756810936432112 0.793088035445576 +trended 0.9019497903331247 0.0439032328187946 
0.6800228906023895 0.743482781932037 +down 0.25038996899724897 0.057178031181266076 0.9557231855444889 0.8635707574640293 +approximately 0.9489411086252637 0.6559991664192764 0.4845023643394001 0.9239455234691858 +11 0.13501613887842878 0.3478316254150474 0.33305505466815366 0.001855521016600803 +episodes 0.4370403687663117 0.053269401912472425 0.40651734366043046 0.8301693825460638 +ng 0.6548389737714848 0.5262942948811564 0.12564294455835823 0.6589920387253876 +lavage 0.653579821442105 0.8995051523800992 0.1501912581495377 0.27278379630977334 +only 0.673939022842572 0.4625022668045784 0.46105841074564025 0.05681213651698169 +50 0.11369509986751691 0.7982136415475867 0.35684551898600547 0.03899215869106576 +cc 0.71974759760451 0.6998005021270243 0.6949727462411862 0.6466478312193539 +coffee 0.0445942919380341 0.8698821141865086 0.013112192402692124 0.5124534644665742 +grounds 0.9675904978281482 0.2641706880587713 0.0036261211805421745 0.03979055569908163 +bright 0.05885307677084217 0.08320887501840613 0.18186135423596472 0.5949827561546679 +red 0.5724608410742513 0.8387545426022514 0.4885649405818391 0.13234700324428328 +ros 0.8956417852366385 0.6254579053655874 0.9973498019383344 0.7777267488331934 +-ve 0.3519271329812518 0.7138289001992336 0.8600534800541026 0.3971484822342324 +change 0.35879093326209555 0.8470933721012025 0.11599717830143141 0.45947068386710865 +habit 0.07408282347763162 0.1723885745415744 0.015565937522318163 0.5653694688678513 +(occult 0.8306014227803201 0.5658500789552745 0.06419563936804973 0.8597329281953051 +nor 0.8544436276153116 0.22149344534563753 0.07976545999371909 0.1705121223321866 +overt) 0.9903055642284605 0.7653961523772439 0.17455976898141268 0.12430327330707702 +alarm 0.4156987891375866 0.8295359662960354 0.7223762461790674 0.6774126215464215 +symptom 0.6160852123273536 0.9737444962994489 0.05202234345916723 0.4708814431069527 +sign 0.21422276664365936 0.07958535469427885 0.8203548593103013 0.1689258121910432 +concerned 0.5995533553068798 0.856568689603906 0.7611828202036097 0.47247189145821633 +possible 0.6872532000883463 0.898964578061599 0.7807671117386381 0.8129442243233996 +show 0.5516052332450805 0.8105802108141017 0.17631429529668596 0.82619119743816 +risk 0.12464180884236187 0.27521664410068925 0.09077691642482855 0.5824079567398402 +factors 0.6285454672457363 0.5921228351775166 0.1854355580221052 0.20514438496455345 +including 0.9190842355139471 0.11861621240808284 0.9111530422225572 0.95096139152813 +tobacco 0.4573971633637537 0.26614048516356126 0.6898542779357562 0.041026199154888254 +age 0.19873501737067178 0.18138289278088326 0.7797002202207501 0.5369131977174287 +perform 0.28929952515712 0.010675027743156873 0.5905876022494764 0.3351038821447241 +few 0.09611440736733168 0.6937524198729156 0.7410209031242309 0.9181558333386823 +atypical 0.7598999909288405 0.5620500238031616 0.2030665493266497 0.845237383869143 +circulating 0.15721244430780823 0.6270620913957581 0.46901373005771674 0.09070074403239992 +cells 0.8856473922847146 0.3358967069950334 0.7915301556797201 0.24659363249002753 +(some 0.6446975380391962 0.7440064584540474 0.19851508072898383 0.29348546006608045 +monocytoid 0.17489202341049237 0.5091786667575117 0.1645160677418982 0.1137106882144594 +appearance) 0.7391349468543232 0.846219856376191 0.5324900619088325 0.17210911027322917 +marginal 0.9826993486622702 0.5983710732284522 0.522853349538636 0.8215683977363734 +lymphoma 0.8921219138120694 0.8793416230688492 0.44117891528291286 0.5041521300931349 +favored 0.48828308141147225 
0.3189511497535945 0.0015640857724367185 0.7133214997066343 +artificially 0.8352455618579256 0.8057207199161923 0.019605211984783932 0.9587369876971509 +able 0.9239643840900182 0.7613558722684304 0.5970799314261837 0.13235176121076786 +troponin 0.9987696811548504 0.40491560970287355 0.8198523816588088 0.6129856830289097 +possibly 0.9345630192345726 0.16856314677553375 0.4211798223201093 0.16629508820893335 +old 0.34395696299227985 0.4121458011079193 0.8245141888536216 0.1980304637646041 +infarct 0.19612691090283307 0.8504528092719716 0.4730551243874571 0.20069995894936965 +great 0.8629325529130284 0.6548513985362179 0.37772609947553737 0.2972342059247599 +toe 0.6729064454496516 0.020103632293929863 0.3709085329615024 0.6299735777953194 +laceration 0.24999539546955074 0.09751668685519566 0.6401832618613524 0.9692762996466047 +gu 0.1414795314516919 0.587182734587238 0.7344927986733132 0.3232717781016977 +urgency 0.4982837126358771 0.954914216463233 0.3845515758133279 0.5187606294094883 +antecedent 0.860933770120159 0.19358339180641282 0.15999579241859063 0.6929890194274448 +palpitations 0.42570054439431193 0.2839255449172793 0.1976782223563136 0.3870017350916216 +lightheadedness 0.5180121955745582 0.026467592596838063 0.9786778295307778 0.4635048131936347 +headaches 0.38400598409584874 0.714094765270092 0.4060075389639417 0.38564543767678205 +orthopnea 0.8261283477211535 0.1938698669539265 0.24066795188329237 0.4878426911052799 +sputum 0.6549819849593287 0.22707144673113633 0.29060893338881 0.7457802767595743 +production 0.4071575170886922 0.6980181426180105 0.6153247825842658 0.20989796849051767 +paresthesias 0.28757630325307004 0.14509870801963953 0.4082227133930051 0.7051957530952524 +left-sided 0.8349099616179697 0.29152825325635756 0.14956897236161193 0.6198907372500849 +seemed 0.37993165568967113 0.7387031309802201 0.08473026474356382 0.6064439543106886 +resolved 0.9691924802723844 0.7516282639729894 0.07559275544425881 0.8007001096027669 +reached 0.06020635110391037 0.11374249559990879 0.397454804946511 0.7283605644730771 +complaints 0.9593647022596007 0.3575687387087999 0.10935341797052955 0.41273602065985704 +started 0.6700926900112839 0.28796773852317725 0.21269901842420358 0.3237016642696028 +coumadin 0.5462962010599888 0.6063753758429732 0.9312565767091168 0.7150050505985694 +will 0.9049394293700062 0.6172552726659828 0.09492561434510105 0.5855523597475679 +once 0.541630784601379 0.4521001142064537 0.7904356911440262 0.9700261497843469 +0 0.91438443478066 0.9816660327725423 0.1539874033833385 0.5151638326327205 +complain 0.6132640782263789 0.6037270985655675 0.4240341164908652 0.33025282446955173 +pe's 0.6174037801934205 0.39148725785597427 0.14681264404457794 0.9138747533177294 +dissection 0.7559408635505 0.6668895993359661 0.7328956215069539 0.5536578973864847 +infectious 0.7356837973366479 0.9234137255438098 0.3999525454081563 0.9717733589659083 +etiologies 0.07587814829631334 0.16455423833208838 0.9050700586422443 0.14613711028957221 +good 0.2992218506490355 0.1470240990922228 0.3693965949656206 0.17575668171451497 +entry 0.7683205270594237 0.7218288760826331 0.9285401476464276 0.7783878094245387 +bases 0.6280627050075571 0.4517091393428838 0.9040623078330259 0.5096427749373128 +occasionally 0.41693372613509927 0.9500499686248554 0.6845756503972826 0.07491897352882548 +loose 0.04407603106540181 0.9342077180577703 0.5293655056013399 0.16899259391407484 +stools 0.9939430041200559 0.815306397508688 0.0631213823533846 0.12284895200094004 +fevers 0.8166819974485262 
0.7569070347285488 0.3172774305296875 0.26329758003239323 +chills 0.18992220456067999 0.526555200943895 0.7916249733177778 0.29814682266571146 +sick 0.09643063618604497 0.1370632936708276 0.1592748704010052 0.2281890311747673 +contacts 0.9521401547139365 0.2284860235567403 0.9187109197732787 0.25697408695042157 +antibiotics 0.8403414910388327 0.3770988188168021 0.508589771108191 0.9203259705180084 +drinks 0.3603702588463086 0.03429034598872971 0.14410719165330166 0.18978261934937157 +cholecystitis 0.9829099578078765 0.5961324665136183 0.9081641174095495 0.6176173058252895 +peptic 0.21906976407017476 0.18349172320804463 0.43623024290487833 0.685825045099467 +ulcer 0.35668611848159715 0.07289882945324033 0.8707169687035491 0.39743474558871916 +1+ 0.994835091035359 0.3599932790307372 0.6288353220071127 0.8057343644646338 +bilaterally 0.1163327986786411 0.3602129802824362 0.9273292737995003 0.6154727036870227 +cholecystectomy 0.9780389400654554 0.4554987676786939 0.9381376263279397 0.20982820033952632 +cataract 0.6700686725971049 0.5811370005273077 0.9109532248743002 0.4579114428493718 +surgery 0.7975561387513669 0.313463427519695 0.5918111294272037 0.4397948377476252 +having 0.37240307787146854 0.5670042312676962 0.7912520055598516 0.7895396215347235 +oozing 0.6674838745364385 0.2387309005636743 0.01127103497085602 0.7014164393947685 +range 0.4820684763069084 0.747457781768532 0.6271764474821421 0.7704927105845418 +digits 0.9947628983260121 0.3080123012994742 0.2386461424719064 0.7459865238968268 +got 0.13905682801149422 0.9526594485702944 0.4536847258680724 0.9642631983891078 +capillary 0.2266058865481645 0.766364013920006 0.5498474287002145 0.6980772026012755 +refill 0.36466646309645256 0.5378252161672047 0.16901143717766853 0.9096391030793539 +sensation 0.06663904850956015 0.04482011525559526 0.44513913742733735 0.9273129114440848 +anxiety 0.3384652490784241 0.49024382705144387 0.7388921223225384 0.3792270010570702 +progressive 0.4386635613539497 0.1843359354081966 0.4586183609735527 0.3118068266656785 +several 0.45607655619751086 0.4289343165805338 0.284149660384595 0.7972157700004556 +days 0.8269427481038899 0.7011988098684504 0.47800981600520676 0.4909158738129351 +alternated 0.9216942732704568 0.6570991122048866 0.8680487975615847 0.5390371737014729 +developing 0.8719213501252226 0.17906248295420146 0.5161816107849934 0.9549711372491706 +uri 0.8792330708189524 0.6775287042713664 0.04982491587847493 0.08557983704926841 +rhinorrhea 0.35028336828842876 0.12276114929722703 0.125788594564601 0.06963939333456348 +off 0.33602012683950566 0.8311142867741929 0.5207933287429823 0.25461261185624817 +regions 0.9182542355705291 0.8590597347778252 0.8984492874880133 0.38257232862980006 +still 0.38293460881241614 0.9686245019289024 0.38944551335809974 0.9820736657742303 +uses 0.24705422005474253 0.5194962322706578 0.7398628550350285 0.8099996207708186 +advil 0.4649012688773275 0.31590261988546897 0.931938321200745 0.3758815706687487 +relief 0.42940869441170326 0.3904727010520792 0.8622721030215407 0.5509220633878177 +specific 0.7056441253979274 0.9426403992150109 0.8743825691288072 0.014924581137898851 +never 0.7860218206421594 0.16575112521195556 0.6856792179564721 0.8265610244751191 +sought 0.5406066847008196 0.4948848775297977 0.5233370448751336 0.7158844167967001 +nitroglycerine 0.45091999397010984 0.8865100619173707 0.08921984879228506 0.792720695256531 +paramedics 0.16194513069655359 0.1014341665076317 0.3109145753598934 0.24055073156278683 +brought 0.3645710656750818 0.6041188634670925 
0.7041023368253512 0.2227188899615774 +into 0.5295001538568721 0.6142788806599261 0.5709059457107007 0.6662826357184667 +coworkers 0.553685666400763 0.05078520281851817 0.18951812936736334 0.8085430994264047 +prescription 0.9516543315439031 0.5932451027512021 0.6123282354655468 0.3922733124239277 +zofran 0.8266070297083004 0.8532325896684244 0.6829296102493597 0.17788796871665957 +case 0.48998096213335374 0.6949951948343628 0.4068037547286951 0.004274206215338694 +returns 0.2631952425307096 0.07456364402214466 0.8661415243405762 0.9747302385844766 +return 0.6915904379129859 0.3765430209111351 0.45617560286220704 0.47043490711901437 +worsening 0.12811267247399405 0.11946600059701418 0.3385624899140207 0.6010975966283532 +febrile 0.40308091601520124 0.07343919727804615 0.656766606177956 0.39091500674971325 +somewhat 0.08641617824645154 0.2879164396313514 0.23906267918343427 0.4676875301221143 +confused 0.6699990817923712 0.26278339105274007 0.22668729213435013 0.2739241915852677 +readmitteed 0.8503630245251225 0.5125400652907516 0.3078337957650855 0.8509927530416143 +07] 0.975717428617612 0.07121926058925976 0.012599544743910673 0.060846738026909986 +leg 0.6363361894972432 0.06478019429818715 0.8296579434791238 0.704588978880411 +swelling 0.3236342881038978 0.3380686484951667 0.7423421723906338 0.3731068308996829 +occulded 0.06079559150488767 0.26503593244188317 0.149991343880584 0.02224164903596626 +when 0.9166029527678228 0.29376125425847455 0.039824220649997244 0.20343078804591297 +gets 0.3541616457369775 0.4819229451252699 0.7052691169551574 0.6004878577441817 +los 0.43749589416290846 0.2637158533048861 0.2685931032261464 0.02416260521555602 +vegas 0.021536880184720664 0.9558141537438211 0.758837834878462 0.29834939043983033 +suture 0.18481066437792681 0.0578294310543398 0.520482795003255 0.5352887176418749 +removal 0.3291724166854313 0.2318165569570747 0.28392141507341695 0.14183360080843277 +redness 0.5672233870675861 0.9557305893700002 0.3157491677761759 0.5248288618928395 +pus 0.24558527723878776 0.237340096527362 0.6850270997173526 0.8708779465495875 +site 0.7109964035593102 0.4479532437914089 0.8310982118462751 0.5642485693082588 +head 0.7162327328246152 0.28941531068592175 0.2991977510351391 0.7763948122597594 +normocephalic 0.18414600255519875 0.143187050901317 0.7220137546609889 0.5115501522612931 +atraumatic 0.4006501987986332 0.5669156127415417 0.8685163676143549 0.9250724171027923 +face 0.09127685112696438 0.5017606959766271 0.8557905761451903 0.04469787101654776 +irregularly 0.2509585290168054 0.8328406622209218 0.42526935655258236 0.1195886666441498 +irregular 0.042988958665384436 0.30551441321773654 0.5216793903418697 0.6980341647751414 +s1 0.5606474139304081 0.4418170164331877 0.2811749360594311 0.19189347084566644 +s2 0.7537638883396132 0.024217831453810068 0.9345499580886699 0.12434252292023695 +tones 0.36254081675884064 0.9179723285820738 0.06208239121008963 0.4772164695532005 +heent 0.2722360253761503 0.4886895548711755 0.09847843526932853 0.5552138278993671 +difficulty 0.3048507133980405 0.6383393026699468 0.9795491162665513 0.5664559929477765 +hearing 0.09775540064597199 0.5307745195083056 0.34666528842179245 0.6823720810331696 +mucous 0.3842255842615854 0.40694564896544694 0.7807010694057678 0.6846055649932554 +membranes 0.6871483184538785 0.5003357809337412 0.12409014301238097 0.432615798794477 +moist 0.16469969274434826 0.9547464961480766 0.06778332740171578 0.2928742765400224 +muscles 0.10558202624824164 0.8194163640684843 0.8293095459804725 0.35903143043163643 
+pupils 0.36144959105520713 0.650555064317259 0.9288068652895296 0.7621573286671494 +equal 0.8254393994737247 0.9837280913395384 0.7818041047206383 0.6211350323259358 +reactive 0.9182638955592537 0.9581284051564849 0.07069754075353119 0.21551291646032866 +light 0.9271104827821869 0.671623042477865 0.6312812448770224 0.7765983785936602 +sides 0.9576327649254258 0.42456396128600626 0.8963540862701406 0.8873016122470047 +digoxin 0.20560648084473354 0.8707044652898877 0.7753397287419214 0.8572157507732351 +subtherapeutic 0.2323632051763913 0.23516356483017786 0.21702082797718092 0.5966446494539985 +tachycardic 0.9369029133202547 0.18223278326523018 0.6019021840839462 0.45176781707597813 +antalgic 0.9576265678813838 0.6199208134974233 0.5413816288267559 0.18285731826311535 +episode 0.30791804131347444 0.713483556602725 0.6228220906715249 0.7684385832705969 +yesterday 0.1523444177428832 0.6467883771436993 0.13278134868791658 5.137177678375959E-4 +lungs 0.7640326018750194 0.21821309467144112 0.815071243349127 0.899441967240983 +coarse 0.8688188750475194 0.18688134448078852 0.2759809389766903 0.821799142861821 +controlled 0.7780575911758787 0.3953884796312199 0.7359140271732623 0.760513986738023 +fentanyl 0.9395141084209918 0.0867726339090743 0.4448524570291039 0.7438061344954555 +mcg 0.7945558274349594 0.506337781185856 0.22847479913711555 0.42792692610046024 +patch 0.119894748081257 0.8330372412377978 0.31242034876277336 0.9484308366096603 +neurontin 0.3534237883629521 0.7204770657312185 0.7841045257263549 0.8456818514451394 +300 0.7562277683699072 0.2442010875021321 0.7946550836770178 0.4125256958337552 +o 0.6466401173028812 0.8497702406417692 0.8779530229890401 0.12553541541857316 +remarkable 0.3572539301309564 0.9752508894627852 0.5694255577922921 0.9159644841549025 +leukocytosis 0.012086644630043497 0.38207812089093807 0.8441531346266833 0.9125175072746768 +thoughts 0.6986351752547659 0.8698614940003004 0.7620163709306419 0.9452765672650553 +logical 0.11834862406551427 0.9735528556063587 0.10702266579816566 0.5385958607339671 +herniated 0.3390841012092273 0.4454964656904161 0.7386164013885521 0.34050597410981054 +disks 0.8444001124223118 0.8102340794036823 0.8346909967394298 0.9371438103106179 +l4 0.6770818706230326 0.04653876365342369 0.19918462794170133 0.7829519742782668 +l5 0.1080808422611097 0.4990028739459109 0.33155596292265366 0.5408676140364765 +saw 0.9270907358972147 0.45060721500750034 0.01109989790447763 0.47144708383673184 +they 0.147270925756522 0.39804997162719047 0.7359777842967933 0.63584951091669 +arrived 0.6441672880406261 0.9632154198532795 0.4750004049192491 0.5848291540913191 +posterior 0.007669493381691317 0.5470143240809093 0.8396158911035212 0.8943625808687569 +pharynx 0.6758830939932788 0.2859290707911164 0.9642194932418621 0.3782633819258898 +erythematous 0.24108358336046054 0.6069484003558179 0.09481512333664444 0.05911404486623317 +round 0.670458555961733 0.6931782101396031 0.12922387257332335 0.011070650331219456 +tightness 0.06813366116634467 0.4702662853059495 0.12644756112096078 0.00157288677439138 +improve 0.5200164111612939 0.3028804153073724 0.10846053982917425 0.4537186263677139 +duonebs 0.5092816752356463 0.450831054796768 0.31461608309902667 0.6270382930380292 +prednisone 0.3678611003699793 0.47603118961758295 0.43701855895173936 0.07163160049445927 +orally 0.23808831874657632 0.21433743121809656 0.2646260069593437 0.3491050077755673 +sclerae 0.0586903715349526 0.6304181674353169 0.5945121516189492 0.7334754125505448 +social 0.16060544088462836 
0.8591462480603169 0.23063108055779113 0.10618188337279333 +abuse 0.16064513076954456 0.003614777597929386 0.383591153097372 0.5415737239929235 +beer 0.9748099421699375 0.8077267046926526 0.8093600213036174 0.1427199998018719 +mr 0.9381039442344634 0.15041014536193675 0.5804851743344452 0.5321119593250093 +**name[bbb 0.2139855368511301 0.41886284324247725 0.47032990951472264 0.027806446494466952 +aaa] 0.9041285385174433 0.20983122704377044 0.5489876959008956 0.9427850496604058 +male 0.5279584114675899 0.4807804000778153 0.7389524918525132 0.6337800580382067 +bipolar 0.8314662634206745 0.22691350308256553 0.7592864125265963 0.8955452343007381 +presented 0.20609317003522842 0.9953503371321052 0.0989401103229931 0.04752541359217466 +radiating 0.3510160212324581 0.06437639506566695 0.4030459122798199 0.5921062693388217 +arm 0.1959552514198758 0.9135253471344256 0.9304167988643111 0.6371012944331116 +notes 0.5166638573604285 0.689359710093279 0.546149195397648 0.3698602619132364 +us 0.12154624660628466 0.7321653176092868 0.05014155358007366 0.7463598424727848 +many 0.9653943811266582 0.8703775536944242 0.5105039752564677 0.21586321835521805 +years 0.05501798105189659 0.11240847239228402 0.6702494751008344 0.7402127408180788 +problems 0.5515842637560521 0.5696454856794129 0.42632021433625067 0.5865458005254197 +pneumonectomy 0.04577208040600744 0.9297439743600373 0.8419172303395235 0.37320059215904133 +adenocarcinoma 0.5843842431286673 0.5659377103182803 0.14626523254831458 0.6308054754389998 +lung 0.6608508475305789 0.41594250695503865 0.7003592101260646 0.704021704297727 +copd 0.047095621557046274 0.09313014228561656 0.9629023415913668 0.16413608309001348 +remaining 0.040526072245782085 0.30300105778808706 0.07756033079804237 0.02867673427455608 +nephrectomy 0.4274477830167436 0.5295859236944078 0.8208194991677189 0.9914514620589825 +carcinoma 0.2229917134952326 0.12925801211699828 0.29915205739834205 0.5548851992393541 +thyroidectomy 0.03768273844287873 0.4235392262087361 0.9648541080878271 0.42625900135242334 +**name[aaa] 0.7420668592871649 0.11783501045226974 0.23183761890836851 0.7904302235674034 +six 0.9533530094540579 0.38963411472362974 0.2158203572374997 0.10383963139834118 +syncopal 0.7518152393826297 0.2875616110331607 0.897772896310218 0.8680878209403162 +50s]-year-old 0.9497392987007501 0.6617742246455265 0.3151451176772173 0.4360903834167914 +african-american 0.18198746511896724 0.7065392784800949 0.2223090358627945 0.27979680522057626 +one-day 0.064793921923336 0.3374398877006105 0.029818857903131768 0.30568585872615994 +ms 0.6781923085925413 0.048500523300006204 0.7315031268242163 0.7427257554994311 +two-day 0.6675163679605275 0.598628689272482 0.9590677778037057 0.8472146403467181 +body 0.33070308383027736 0.8621200190243421 0.2658033759851107 0.16989313659756144 +aches 0.1606673119706643 0.4445758165249398 0.23415333476891476 0.41294727903474593 +white 0.44580574764779646 0.21343330558950568 0.02631455945160166 0.9679969929476288 +hypothenar 0.6967271277316198 0.7214105453897858 0.5527054285411988 0.07089159553613655 +eminence 0.863594611460331 0.6115199131900324 0.5200505640221968 0.13511023518174248 +eight 0.5360054701509824 0.4839048307946576 0.23899563433192306 0.23279216216077336 +ago 0.5761221401831869 0.3776067908857649 0.22057973154215238 0.8808979895070104 +dvts 0.9258078173900525 0.9744706536846045 0.2715565198245362 0.2597295788067242 +80s]-year-old 0.6498088859501268 0.8677483303009376 0.010469840931267127 0.7420131409509229 +presenting 0.6295654468757725 
0.7545619029478984 0.46955347284108984 0.14297623517218894 +bulging 0.7060201147428278 0.20923942748998003 0.7219383146088107 0.41532632449289886 +aids 0.48483504007793543 0.7736376022619855 0.27931518705201364 0.6077647330777893 +abscess 0.4637149128894681 0.46091767214227375 0.34211449146212025 0.7816467559340635 +surgical 0.6190742338635965 0.410842043903235 0.8524290760436208 0.237625027337788 +evacuation 0.0980746817028808 0.3575108017268679 0.6993197840505668 0.8425898361466198 +**date[dec] 0.2020991866461629 0.7745588258009048 0.35680793448658976 0.46694670497198454 +2007 0.419016442137883 0.04818602383944859 0.6967724355315583 0.5432481217205599 +increasing 0.8641305441963131 0.7949943911318834 0.7121340198623491 0.539422677263713 +weeks 0.12349799929924965 0.48945721769269923 0.41012243322995834 0.7033217968147574 +one-week 0.1583342086083448 0.894947835768629 0.19108677663306572 0.7800303989818832 +intermittent 0.6255206792525814 0.40260607411995986 0.6152235849102284 0.5293647102389598 +typical 0.6967626321874326 0.27392276914138103 0.5777605178731919 0.778206911624234 +says 0.21773617387947686 0.9557739226408045 0.5625485158383536 0.4308025819603022 +complaining 0.8386036283472607 0.09073788206835531 0.8246783980069824 0.3298741006712218 +confusion 0.5685634968019011 0.5808801186939297 0.06465295939350535 0.11237916738938047 +dementia 0.05964227972247804 0.07979226788531968 0.3998086563938088 0.23597018803622904 +hypercholesterolemia 0.8087225935677411 0.5855232254959096 0.213095747517013 0.16568730401446774 +fallen 0.381842674815126 0.3861417108440047 0.764989711274844 0.3051438720532611 +rhabdomyolysis 0.7110905867706983 0.47002155653520894 0.29122040437202323 0.6324401593523368 +deep 0.49195534395212337 0.6545422935845523 0.6771701061746837 0.15826724794652292 +vein 0.7177871471424444 0.10974899278504191 0.6992050400740594 0.26213996986272137 +vasovagal 0.42805693960844526 0.6576175898055204 0.2538696424023177 0.6824240477829129 +30s]-year-old 0.8183950532465798 0.21292246928111602 0.9577863510083121 0.4367995674431373 +subarachnoid 0.9629282479399219 0.9983056225732929 0.9942680350211488 0.2320288187617675 +ventriculomegaly 0.5332221402396725 0.7139974698309414 0.10706435653865032 0.9853736925784514 +caucasian 0.06317689031577423 0.6717417426357039 0.19732713535982394 0.9247341674387585 +woman 0.5055112591224659 0.012827529619973244 0.06679387978985818 0.36055586014849883 +antiphospholipid 0.053010838947716854 0.5217339722601739 0.46835319899460903 0.6297815024737621 +antibody 0.4565039924611164 0.21974424094346268 0.7745690223284764 0.900597262878075 +lupus 0.36156906815284295 0.8769867975978163 0.25540536893494226 0.49645203842115393 +anticoagulant 0.5072648150567713 0.8167140225996008 0.16830713961265464 0.14880384351105125 +initially 0.9827155920069921 0.9022296347188418 0.6069564133586418 0.1202741413528593 +**date[jan 0.12117157220322328 0.39472748507116084 0.32943811255097677 0.45336789403307165 +09 0.02620318757407214 0.44940663032355666 0.5922123868173035 0.6946431157309393 +subacute 0.17097888205456024 0.37757837473754097 0.3532058020202752 0.32019042478328696 +subdural 0.9891129129070056 0.4440833889749305 0.07549708289500823 0.7455101317182342 +promptly 0.8843605061059039 0.8921875045613171 0.7624871762085926 0.24135029670113606 +dobutamine 0.6264757264563534 0.09222747412154486 0.7861077460752013 0.29679599746307184 +drip 0.6718367802190939 0.1956286768429516 0.5315360348862103 0.9718768885398524 +diuresing 0.549982682160734 0.44251869943999156 
0.8750481864703452 0.5641863190843673 +lasix 0.09099394740853384 0.3190123854673762 0.036057185549349335 0.47748192721425586 +overloaded 0.9336895225673525 0.08934523075108636 0.40004560870701333 0.5249643861924763 +persistent 0.45090074721157847 0.8383836103321013 0.4960174665482434 0.02590727546625038 +inability 0.9556222560154677 0.0516422377452852 0.4302573805774146 0.9615377817797441 +walk 0.8106294381348025 0.9327901544822418 0.7275997627168314 0.14454453598358585 +fear 0.8541899053095591 0.11272892950715252 0.7120739856947912 0.8263151362724965 +appropriate 0.3757868985731775 0.13967753116830683 0.2707065352756387 0.41698300538498356 +send 0.7283104998249619 0.1563004374027881 0.08742442301658182 0.025589210842931753 +especially 0.5469621407778874 0.46776687370388714 0.20841863154548634 0.3411419638670541 +early 0.46015684865002104 0.28373068479386543 0.10599027512104986 0.9700825882116286 +hours 0.12399672518541571 0.777571882789666 0.24481615701391535 0.7216438115743651 +friday 0.3894280996523032 0.7478425506799962 0.522734760255335 0.4898122677978384 +night 0.41998414891331604 0.5897112618291472 0.8575672870506769 0.31908414924679684 +human 0.6180534805118912 0.9883016529975878 0.6221529676888854 0.9609291975791144 +immunodeficiency 0.2889044498661393 0.7311728113647172 0.5916914208345639 0.7288549342259323 +virus 0.07423429366452239 0.303789783005428 0.5606494647303569 0.7421893747935765 +acquired 0.3208003312249258 0.47968537238911513 0.8109029794710289 0.7121773979704867 +hyperlipidemia 0.7147307921655932 0.946114458046194 0.168449720811099 0.26871231687836383 +hypothyroid 0.7084009658073512 0.1163550355300873 0.47294181127860746 0.4520004314794308 +hypothyroidism 0.49429964333934473 0.6714107358728277 0.9024513805311194 0.40068363163512155 +believe 0.5391000607434151 0.04672163449994604 0.7452969160055275 0.4564210279152624 +meningitis 0.9811887040817981 0.03359158646562399 0.8914216746084552 0.8746574060596889 +mi 0.5161937831602164 0.9003919929618728 0.527729383899297 0.08228295821265974 +images 0.7475951499578802 0.9552030762876799 0.2649474700144203 0.9551151353762513 +spleen 0.3632855818061226 0.2694060549633953 0.19800074730058714 0.9016099634911671 +important 0.9139931976735713 0.3608404001342308 0.20754379375023913 0.6339372171869323 +note 0.41576339354916647 0.9954921967255261 0.14721346769865318 0.3835295730019195 +physical 0.4762376940315136 0.9082815296135853 0.42932750291089294 0.9754432777916687 +interesting 0.15325626222939626 0.16636784248740855 0.9211691971260634 0.6143439282846058 +finding 0.294349878006136 0.10249404402860396 0.9946618684364606 0.4546575567550203 +enlarged 0.6769757917447884 0.2312418684594748 0.034151469774753695 0.028403938331514 +healed 0.8718915219915111 0.3276720244551439 0.7001789612636188 0.3864556838299855 +calcaneal 0.3858494086963853 0.8956749548378653 0.7715390662082056 0.37244269194944046 +tib 0.30483367690161123 0.5824588375567364 0.4435853624084234 0.7932490221614091 +fib 0.35535126625419955 0.7059538260407289 0.5476559386419646 0.3760247724912751 +fdg 0.02882624952792201 0.953512190743019 0.2746581080447792 0.30886296403311575 +avid 0.17293708656671092 0.48313629162779415 0.6352644537573578 0.8507675129067511 +malignancy 0.24377482309859522 0.11256781420240836 0.06983668168236279 0.966265137014521 +pathology 0.23398945137833482 0.01692260338073792 0.3331078340700474 0.19099041160107744 +nipple 0.1446308078830687 0.3721579055473817 0.02181743551718318 0.9668001228376895 +projected 0.7521793687911096 0.6795098845435474 
0.8879081743386665 0.49844905252474814 +base 0.3373011614229794 0.08514056594297859 0.38555681185208956 0.7785866692123323 +addition 0.9881506928902055 0.8695857949009151 0.7372262493113042 0.8038097872361487 +medial 0.06858399620129851 0.42319646031443836 0.17208933687847894 0.16173387024984653 +uncus 0.4504336637537608 0.4466173606794164 0.8895894818445028 0.10752133609573955 +(series 0.14903648569386674 0.8194492508795547 0.731566988684511 0.9680322733521438 +image 0.07249346656381095 0.5655917513476982 0.5082941163291814 0.21273957378401587 +12) 0.6976809172286802 0.6103655374127575 0.41692590433050103 0.8127764999950818 +more 0.619820531824233 0.5723647162335916 0.015288033513375265 0.5729620064793142 +conspicuous 0.08745547396046194 0.6620909801463778 0.7695867521853191 0.9663753159105033 +vision 0.830968333841442 0.8697733810779898 0.2815403336125887 0.6270276023906343 +diplopia 0.7203159445631546 0.01637873359687525 0.9185951590475577 0.0634057157449538 +meningismus 0.04644546162782315 0.9146693484114033 0.7203040567424296 0.3441389523681262 +hematuria 0.5108944884624538 0.8412663084279399 0.13460818296916988 0.47676693534734316 +myalgia 0.3293678013400243 0.75602837718994 0.34967810050909454 0.9515624744404747 +arthralgia 0.20762179677204906 0.22165731001321365 0.7301334075698555 0.4545909344118777 +rash 0.33177714992667606 0.7742932486782939 0.14087486025456664 0.06766303550480068 +my 0.9302189332178855 0.5749641335576388 0.7474264922977051 0.3675654339918829 +concern 0.4105327920016595 0.5321054756183018 0.21859949532990286 0.7129295161316005 +terms 0.5524649046414675 0.726968012286118 0.1254538810842496 0.6643257312358782 +indwelling 0.7466818753297494 0.8965825883305215 0.7299474884545027 0.9216470733121502 +catheter 0.8091466700496133 0.9330526401195203 0.28732664541045394 0.3825718452481013 +er 0.4722151541936572 0.8432971241739684 0.44822627487486744 0.9268729176835405 +anterolateral 0.2560970492058151 0.37996737046594864 0.339611635240584 0.31356661292781407 +incidentally 0.05811418189846074 0.6021133909097904 0.4281077052382869 0.1413712483000591 +gastric 0.4622278134253177 0.9532767266373648 0.3249265620409957 0.16767181768081163 +incisura 0.46759635031908986 0.055038450768984726 0.2785170369385771 0.4071641689366685 +incisions 0.06246315879875031 0.30565203925496987 0.2192247328553576 0.33857785809875607 +dry 0.8284637906916325 0.5462435522321284 0.8696651954975663 0.23635447486245498 +cholesterol 0.5504806880442324 0.5111615512726673 0.6151762459178731 0.11388365952014678 +cholangitis 0.628462645312836 0.5144122104520661 0.5406303956202211 0.0158766646832178 +zosyn 0.9401654811923986 0.8407155733918071 0.2830515232382049 0.4296828943620945 +flagyl 0.6875588063049578 0.607571140975786 0.12389502157150345 0.39883237166488184 +intake 0.6746840311269747 0.4438223077902198 0.0505133871426392 0.5168127298489694 +voiding 0.4110363703400174 0.19056038925745866 0.3532447282307828 0.3550666659944468 +ambulating 0.41208389985683225 0.9718113178195099 0.6927244130219616 0.03642165331483238 +independently 0.07732512154744109 0.40529886806944904 0.4714352973934408 0.6036411467186775 +intraepithelial 0.5663255511671887 0.42768025414303956 0.7741750723665878 0.5525120051098519 +lesion 0.16446335855437866 0.1940015782480523 0.10403393500401403 0.39986041637827807 +trigger 0.7326525955054902 0.7777215989017318 0.6336396889120605 0.43911434298231855 +certainly 0.9691020724088765 0.9455047621336671 0.7158411920910228 0.8995079390341402 +concerning 0.37620661217700135 0.1100851650399951 
0.9536658039318462 0.5697489326903378 +presentation 0.021956947797380222 0.6748550788776403 0.21629953198615737 0.572947584130767 +sudden 0.41279288156318594 0.37543319433528743 0.6060036680267381 0.38683142165958395 +before 0.05361439547242863 0.8963193415481415 0.5302951065775826 0.3917473262800394 +say 0.5607032445919682 0.9948437391095533 0.7143817907239233 0.19867850453125746 +fast 0.5145978275316876 0.5999708679543979 0.2007588759387392 0.7604809042215267 +feel 0.5925023104142493 0.870118999802701 0.12603510877282997 0.801769825303346 +like 0.9036003909147695 0.9920087094501661 0.3394731348935027 0.4163641847139886 +here 0.8656833263733156 0.5816545343543528 0.7333915181862977 0.7759226625168066 +contents 0.4077904676165345 0.9089660555049297 0.49248996083869034 0.7693821016694992 +whether 0.7610999684744174 0.8090259445188072 0.3373238152019251 0.0500353665182921 +angina 0.6090963134455095 0.06596413045776628 0.23656775814348419 0.4055626108110332 +versus 0.9080801652754785 0.2594489327077054 0.9933723814302051 0.5860558493818742 +involving 0.26037564463722307 0.5491675151639203 0.5730320483683127 0.7260539697491998 +complicated 0.3251282953626009 0.6392692525289562 0.03234562111139827 0.2788315361056447 +delusion 0.38537710294463623 0.2125977687425713 0.4354668380849511 0.9446612845700669 +component 0.8551221619039686 0.45363857327627843 0.19670386572736664 0.4194968466976501 +planned 0.20639916264630775 0.8894607392434526 0.8776428252990905 0.916498938303563 +reimplantation 0.5239106746638843 0.66953280388966 0.5841333888118682 0.32005446130484383 +pacemaker 0.6535532685334424 0.5781446992111265 0.7580887390042361 0.5204149541231097 +completely 0.7012187994327308 0.5272687550800595 0.339716626185144 0.9468068924849324 +event 0.45953704398794404 0.013044881229090532 0.9877117971966997 0.5678435584686842 +kw 0.19276577230982062 0.035675841600841696 0.2881045515523051 0.6602941026847367 +cef 0.09382409744229858 0.28037110983826163 0.656873913156674 0.46074225771428456 +addenda 0.6425029674315702 0.9360784574901153 0.9470981744092787 0.4093940263010698 +addendum 0.040176646313021314 0.3890943507811756 0.6083297023803809 0.07126956691212893 +telephone 0.8738777359569272 0.17024598703678306 0.6140446259080825 0.977241712036015 +conversation 0.17138701731191608 0.9571889112193505 0.2724324654828636 0.5658597563886232 +information 0.049029284527522754 0.13746618115202092 0.4664545984595113 0.12662368415228042 +provided 0.22515442849139422 0.4233262087326971 0.831014534732541 0.14635088138429986 +**name[www 0.7748229184650572 0.2758983180066129 0.8682472082220325 0.7425035885178034 +xxx] 0.6350033428960998 0.44417637336185245 0.02634821169116075 0.9308221550389589 +15 0.5346289935948741 0.7533624364094045 0.3230285172637529 0.3193355208796068 +sign-out 0.22014952350560812 0.3874741741970201 0.18159348245458984 0.44679717578337097 +indicates 0.0747526709902302 0.30272088942394415 0.7658100304154121 0.39082106600317346 +previously 0.48822011659334563 0.08100518459690809 0.24913883482562993 0.45946894277890515 +splenectomy 0.6244208557791224 0.4870082459282584 0.13457339615068376 0.7497665568760536 +**date[may 0.5547339751428851 0.22766872969541196 0.9532413266613927 0.10760337163211753 +2006] 0.07823248136340266 0.9294663464083734 0.41308745514838996 0.5556966496671958 +reviewed 0.9332483929845725 0.14677367184066548 0.34337234148512974 0.5697770428599863 +consultation 0.5199784233344941 0.29363993719349857 0.5863840968409041 0.677880447969232 +mn 0.7000788859767318 0.7517234509642352 
0.32467499434053837 0.7336057388706372 +laboratory 0.7167373310953105 0.04868996509496992 0.20029898564696158 0.8714265476866923 +diagnostic 0.7874975575950552 0.9242463833710172 0.9819397804406312 0.5906199862229836 +hpi 0.4621535348128213 0.30508023232835013 0.4664835501379323 0.6341031392398313 +very 0.40931710993751835 0.6953277179040012 0.7635043923362879 0.3294438683574826 +lb 0.8978033231614455 0.25579377097446354 0.10429430428635023 0.8415394415894928 +acs 0.8727660747613298 0.8929996988436171 0.841484216200187 0.34993309459343236 +pre-op 0.19155599535087664 0.008235538379016138 0.3609659163974972 0.5084647314125659 +hairy 0.2805619455796464 0.1671455309988017 0.21473951490762822 0.6709291493785561 +leukemia 0.2018355284131037 0.8419769955344563 0.9014742960385144 0.27364133879100694 +moderately 0.8770883293269449 0.5514739955384024 0.3709858606640034 0.4542850531008833 +interstitial 0.006689643221843289 0.17749956472449535 0.5094957047839026 0.45614750766553114 +thickness 0.04258221002491558 0.9728206343717941 0.5744536982005569 0.6178195062946076 +ctp 0.42657205806508314 0.7910927147394613 0.7908684665656917 0.780666872396558 +rales 0.1892378974431852 0.36793179714191315 0.09550363854109856 0.7598556563550896 +rhonchi 0.9197444971771134 0.061063081347036086 0.3209633505023647 0.11944030712248399 +wheezes 0.06313356692221828 0.31788790822383783 0.9486033569107266 0.8680002080628202 +ctap 0.47920450382106716 0.2804109301200933 0.9273686879047552 0.8485367725979969 +expiratory 0.20588692609980275 0.3603222254403765 0.9550727916467743 0.534819914243635 +wheezing 0.4100453496568667 0.3211300340300637 0.04929703394048435 0.66609913326934 +measles 0.5799460264671771 0.38959103810586304 0.19700358473825996 0.6530710114028048 +mediastinal 0.11615448486636637 0.18528757838134624 0.3651189215650932 0.9702215320163405 +contours 0.6114217629467857 0.8050519411880678 0.17875890206051237 0.8131382715818424 +making 0.27515638118633323 0.22803980041674354 0.12943061505898468 0.1441140196466274 +coming 0.15436781864739957 0.3176062896616162 0.2624604158468572 0.8753396748599057 +what 0.03050653269602377 0.08774011773047286 0.9110615076358805 0.3462114363537845 +outpatient 0.13041494816587507 0.5746619408516016 0.19386804357165166 0.43975318748412373 +showing 0.07964228024548081 0.608295758292533 0.673427943802034 0.6528820559603513 +hemoglobin 0.847665349140709 0.013672768820395031 0.7850591438301697 0.18622155319317568 +gerd 0.7870972169207304 0.28892913213091953 0.3886394462080345 0.6513927740905783 +eyelid 0.2527534119296352 0.6044109319364676 0.07031329238084438 0.3071918285876113 +migraine 0.7014853098185796 0.046500218245788094 0.6267634049661237 0.8307868055792663 +scoliosis 0.6199795582413143 0.3901286384485493 0.501608795042987 0.506045443451372 +spine 0.10526051950113524 0.325546318560767 0.8116551255962735 0.3683302773808256 +recently 0.4314227507882098 0.9877648175092346 1.3738099045601526E-4 0.28357524120651834 +mottled 0.4761589837253214 0.8411952960233288 0.08475863047227505 0.1591838685493977 +maintained 0.4519384518459234 0.4573539247590973 0.9302430468698795 0.4852658690014907 +steroids 0.9892293456518388 0.4901086114065155 0.9682851918874712 0.18695584630127438 +option 0.2529480672715013 0.20079238677297118 0.31219166664725906 0.33203353303065375 +cerebral 0.4383576747306541 0.21461038768263818 0.2691754908002847 0.58279270188452 +distally 0.6889259609619898 0.8001192652789492 0.8401772556551708 0.407081451332549 +supple 0.20619350669119807 0.8551587345648595 0.9152516566728565 
0.9706635639036775 +lymphadenopathy 0.08049079901206668 0.9554673830592356 0.521971391985683 0.3215274145453765 +thyromegaly 0.16138265362223625 0.2763156738459873 0.6649654922448776 0.9464947939901415 +jvd 0.10800683916545184 0.9328410528591492 0.006621869965722227 0.03145943668984286 +lymph 0.6120137650550095 0.30137174166819614 0.8821492951181223 0.7888108754108528 +node 0.6696019322695033 0.18187674807095922 0.9166070516071022 0.029076018216605215 +cervical 0.694184268682864 0.24078186912332555 0.8171751553106742 0.7485379175495113 +__________ 0.2075081419003455 0.03374286396943227 0.7508518939104015 0.880566846926256 +neuro 0.8245922469478748 0.04956810322741112 0.08592935672318536 0.5156260433151533 +neurodiagnostic 0.2566314284929698 0.6393350796291273 0.17501214026664547 0.8333908985833334 +suggested 0.09606051142703398 0.3988464178696094 0.28800785853371214 0.7101467961477771 +lumbar 0.21925662451862027 0.5579230595183824 0.7654327953970085 0.4656073977538524 +disk 0.7448989618619287 0.19663281170072133 0.9555393727047585 0.49552194543457073 +ruptures 0.6197938415701003 0.6564054295831255 0.5231871262053347 0.5999163697198443 +l4-l5 0.0690116929301573 0.34308574139580184 0.27404187205803665 0.6615399089628112 +l5-s1 0.34347263275486106 0.22932862525442066 0.7246025687540129 0.9550650561136824 +levels 0.4256548089425384 0.755340795061216 0.2667193356732205 0.774815955156154 +neurologic 0.6128902971256659 0.49982630032372277 0.9246796272758101 0.2572759789019484 +numbness 0.18775149152074677 0.9470937676015375 0.38360449901097726 0.24435258998489606 +tingling 0.706320165260954 0.911635348605169 0.1798100932781378 0.7843536182656117 +neurological 0.78767049650627 0.6980886910171624 0.6869541100427478 0.3998562093405441 +nonfocal 0.6716866923089526 0.8723448771515917 0.8062008867191905 0.5651252729394368 +7th 0.2273741110652565 0.5986667951835198 0.07430279406368545 0.9304663845123688 +8th 0.5401130560466287 0.8580342052953213 0.24040964614675553 0.7068727332812687 +perineural 0.6838420974174387 0.5045327658182361 0.23058203592180904 0.2815540627219528 +extension 0.7929480043111452 0.039456503951948774 0.8183984368343592 0.5734109540322968 +newly 0.08867779571473278 0.1868945508586175 0.25071015137177766 0.8572447001761254 +diagnosed 0.0861776717066507 0.8399239035889551 0.5395117708607727 0.786469902553532 +anterior 0.8108025176169258 0.7131329999994204 0.12405983824272981 0.12632287036461265 +delusions 0.48612911739465214 0.08505055863750566 0.13178513680419068 0.2730359855664366 +hallucinations 0.8211446042557298 0.3431574768545649 0.5222093951954349 0.9791745066238262 +bizarre 0.7203715646411183 0.6545098767289305 0.9849903894592962 0.8313089992149408 +behavior 0.9682294216956443 0.9766037061410217 0.5813838038774237 0.9334228374053813 +therapist 0.19772160964082452 0.7959622738743155 0.7678242494025913 0.9379312811482684 +polyps 0.2435763770102629 0.4360303306246507 0.6209920677251535 0.03591458305270179 +metastatic 0.8012586708647316 0.5113093137914398 0.7956460616485305 0.7691448827819357 +focal 0.16580935474233538 0.1212700751477751 0.9896670851782597 0.5587123607655906 +though 0.7082570284072931 0.675431106587543 0.4651710557712536 0.6585333882822823 +movement 0.6623869176365437 0.8200919999288495 0.48021395136605016 0.8367497696985774 +index 0.7409404936114884 0.7975625791329998 0.32252560691174637 0.46400432509176603 +middle 0.077858082817434 0.7103311353347764 0.00487346522019283 0.8448053785444044 +fingers 0.001966229811827702 0.5854652762675475 0.5810704196702576 
0.5666686062758682 +sensory 0.18755054629268697 0.8386415109296529 0.47230671372201316 0.3614516181618843 +deficits 0.79854368251674 0.7225508341653675 0.15539799274277055 0.47427316557567767 +foreign 0.3455971595035868 0.6110290423638556 0.05770713025874008 0.8176352411837449 +injury 0.10909513242000968 0.5181123603175698 0.5279108152925797 0.2518431583917735 +space 0.14620686770119884 0.0803204304682763 0.3074461297651151 0.17754520960266418 +narrowing 0.09636388171673727 0.3408199136124478 0.014386258034998933 0.25218971822012315 +masses 0.22708687499501334 0.06572436475426391 0.7696588889880589 0.44942454561124234 +splenomegaly 0.4208765647617483 0.9956364754342456 0.24493452806986604 0.8689398578324516 +injuries 0.09392162745437471 0.38964132724511524 0.0921761584839702 0.6739822376621091 +paresthesia 0.35866569040347307 0.4543104636997706 0.9267559857159954 0.2847444227654129 +wheeze 0.09481532695539097 0.7668779237612211 0.7792085480264834 0.48806143155011017 +residua 0.9771585419527716 0.5996013670181117 0.2051040657223745 0.43732075346624266 +airways 0.9236040365402185 0.13142267188406198 0.048189581816191596 0.6112203784623833 +bronchiolitis 0.8731390413086279 0.893394290122474 0.18885985524109683 0.6759857475923893 +obliterans 0.2809437249029707 0.43593862882877543 0.4781835553882279 0.8528820345221382 +tongue 0.2266885467379942 0.12886006971577624 0.45712493601527515 0.8313362544842824 +loss 0.5720803033185612 0.20912990020919853 0.3912647364783727 0.48770891043182163 +_____________ 0.17501173634629397 0.6768170126493426 0.6132384165070573 0.6707323666488644 +foot 0.7702081441897638 0.40613771839891155 0.1582738746883544 0.0968591775001928 +dopplers 0.7600543141456645 0.8864505206955656 0.20262454535215269 0.9622404136131776 +dvt 0.8243232611886706 0.3371226620856098 0.8703584951557839 0.2227804721967953 +olr 0.9538181617662252 0.5505469146041897 0.5608772118456007 0.6196915582617569 +04 0.24047724528705716 0.7091891453065838 0.06770311243274696 0.43519738316660983 +eating 0.5152807248715194 0.5953976546478691 0.4084874193713848 0.6515124210375096 +went 0.7153384123233182 0.06680841228650858 0.46685557900817154 0.6559492789152123 +05 0.6828244882914112 0.4055301436867782 0.25862011943701146 0.8859838779889941 +calcifications 0.950966412862677 0.16589633772867507 0.8780559803494776 0.8403341099199318 +**date[sep 0.9560830085586207 0.44342161543173264 0.9543466486878488 0.6955843909885528 +26 0.029706186303625204 0.06670227633144676 0.7539740897329502 0.3626153425662345 +nutrition 0.4711097974464351 0.07435195865947386 0.9017389106931839 0.19307402355479053 +liquids 0.9781081879901284 0.9806651047188806 0.061261891595889684 0.7033471445747653 +then 0.7227760558071996 0.7565522259287323 0.8152316268351832 0.6219754335486294 +27 0.398780361500626 0.15639301993225996 0.4163271941305632 0.2861751655827901 +turkey 0.6383037971782872 0.006804476385048552 0.04165902906070951 0.9141080632316576 +sandwich 0.29907883702741955 0.33992282788954964 0.6767348324024282 0.5617683678215951 +smashed 0.25584026359760226 0.801649827862837 0.7793719537487332 0.6368318701100658 +potatoes 0.49554289507270743 0.5944154722240318 0.42964128154697645 0.6865385576495927 +lunch 0.5119804636679602 0.32436047963137826 0.7588418212120012 0.802656751997663 +wished 0.22298116892427844 0.514443857284099 0.06866956130728219 0.39380375088552344 +go 0.8921432957281452 0.21783221591477908 0.20115770595485616 0.2638738532590751 +hyponatremia 0.8271147517969907 0.12002902280147032 0.4006300652020838 0.9050517023265374 
+sodium 0.9477082426321772 0.753553591116839 0.12082270653439375 0.6648652707450652 +134 0.6696837276176342 0.9655813126056715 0.9734939348161645 0.9789595610321933 +#1 0.9338183766660743 0.5768079269012516 0.28925584249858804 0.17474808561075872 +deal 0.6249233650340077 0.24890242352869163 0.5670251861975681 0.44582652565153835 +incisional 0.07175623642054818 0.9382645828284034 0.4254058139084427 0.4730091277700036 +vicodin 0.7963161285473613 0.9554048651189493 0.5374776202575456 0.82507482807496 +percocet 0.18979863452388335 0.572115653050936 0.4298828732695431 0.7107868536620651 +21 0.33252591771238593 0.32270488767183314 0.9577630594338977 0.8734762996675814 +transfer 0.7464567717889236 0.6309141111850821 0.7290148548074215 0.420456259796146 +notable 0.471420576545194 0.16074510372214723 0.7444969350630263 0.1617599899269232 +cardiac 0.49000781353596257 0.33727015935696925 0.04523394618823651 0.5961733543109642 +underwent 0.19108916603986004 0.8126034043981077 0.21332770066613727 0.02725748832457464 +ophthalmologic 0.2582242365902391 0.26256149806709606 0.16020013586047221 0.4180354442910039 +blurred 0.09675990037293258 0.7911438220875157 0.7539560136385197 0.5002420375727871 +oropharynx 0.6722072144745419 0.3928141164099834 0.4870679136685766 0.34251320228312876 +osteoporosis 0.4225254557633227 0.37666629099304194 0.5662501828990844 0.05161781340425131 +abrasions 0.7282301332287853 0.39190983338226604 0.5183019716497286 0.5318110603278168 +knees 0.2767814855117574 0.7107108154662167 0.6488641602652159 0.2596155261611953 +estimated 0.541498810850868 0.7222764671099088 0.46544886264426566 0.8448121855213467 +ejection 0.5102848937250101 0.17117302004880053 0.48176404411889784 0.7431506224245166 +fraction 0.7457342967238384 0.5332174960271645 0.6572701624652842 0.8622639074203636 +55-60 0.8357377981066417 0.06388671777100052 0.09406958919600217 0.9595002643308291 +% 0.30248474206728937 0.47703688767228214 0.5126438026853023 0.6688603802963747 +60-65 0.07780389078166117 0.9158302247660497 0.45861705614315196 0.7139555212703302 +15-20 0.724072553054136 0.8631156536587751 0.1546398573929736 0.36487832819986066 +continues 0.6824628667665149 0.9051123036197711 0.40464920177005304 0.4791048249534966 +pancreaticobiliary 0.9640829525522502 0.8100964844900952 0.9821766239632987 0.5300223205654283 +consulted 0.8646098302174904 0.21289415804084122 0.4900393173364106 0.8714051001877751 +paranasal 0.4793004229024228 0.30633219576207604 0.7656637356797384 0.11213571689033197 +sinuses 0.022805206507914733 0.33240384555811286 0.5244548252286815 0.882696644704017 +mastoid 0.21699905883458293 0.33875492801685636 0.08303827291154375 0.4634336313602412 +part 0.30008212242129395 0.6201990614441344 0.6848674382233261 0.7143182076267262 +neoterminal 0.5314932209773577 0.5003217945397449 0.16260018600161508 0.4969141081086482 +focally 0.41971692407180716 0.6248348319345796 0.41481548854936034 0.3114782579714823 +active 0.2838546606239327 0.73619355428429 0.44905327223968616 0.18560376112624366 +ileitis 0.17486941485969199 0.9780921369979497 0.5488251105719036 0.9455391237316083 +ulceration 0.6287023795494266 0.13880238419662383 0.4591579679220814 0.8772941365618783 +dysplasia 0.20099904186647677 0.8839239187688763 0.353396234486812 0.4418696576613165 +rectosigmoid 0.241971626510739 0.9519650077493784 0.7434023245327136 0.1474250290796998 +colonic 0.9099630840063769 0.49027454805345017 0.4259598587812563 0.9009232816807554 +erosion 0.9935679142444745 0.19419769354050098 0.020097050066957833 0.16277110687162966 
+[... contents of the added test-resource embeddings file elided: several hundred generated rows, one token per line followed by four random floating-point components, e.g.
+    architecture 0.7147032268010027 0.6789290932611944 0.643212819620654 0.010197103263096974
+ This appears to be the 4-dimensional random embeddings file (random_embeddings_dim4.txt) wired into the tests below ...]
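For orientation, the Text embeddings format consumed by setEmbeddingsSource below is easy to parse by hand: one token per line, followed by its space-separated vector components. A minimal reader sketch (an illustration only, not Spark-NLP's actual loader; readTextEmbeddings is a hypothetical name):

    import scala.io.Source

    // Illustration only: load a text-format embeddings file into memory.
    // Each line is expected to be "token c1 c2 ... cN".
    def readTextEmbeddings(path: String): Map[String, Array[Float]] =
      Source.fromFile(path).getLines().map { line =>
        val fields = line.split(" ")
        fields.head -> fields.tail.map(_.toFloat)
      }.toMap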
diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
index af11d0472c0d39..465fc7425896e0 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
@@ -174,12 +174,15 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
   /* generate a set of random embeddings from tokens in dataset
    * rowText is the column containing the text.
    * returns the path of the file
+   *
+   * usage:
+   * val embeddingsPath = generateRandomEmbeddings(dataset, "sentence", 4)
    * */
   private def generateRandomEmbeddings(dataset: Dataset[Row], rowText: String, dim: Int) = {
     import org.apache.spark.sql.functions._
     import java.io.{PrintWriter, File}
     val random = scala.util.Random
-    val filename = s"${rowText}_${random.nextInt(4)}"
+    val filename = s"${rowText}_${dim}.txt"
     val pw = new PrintWriter(new File(filename))
     val tokens = dataset.toDF().select(col(rowText)).
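// NOTE: the hunk above is cut off mid-statement in this capture. As a rough,
// hypothetical completion (not necessarily the committed code), the helper
// presumably collects the distinct tokens of the text column, assuming
// whitespace tokenization, and writes one random dim-sized vector per token:

  private def generateRandomEmbeddings(dataset: Dataset[Row], rowText: String, dim: Int) = {
    import org.apache.spark.sql.functions._
    import java.io.{PrintWriter, File}
    val random = scala.util.Random
    val filename = s"${rowText}_${dim}.txt"
    val pw = new PrintWriter(new File(filename))
    val tokens = dataset.toDF().select(col(rowText))
      .collect()
      .flatMap(row => row.getString(0).split(" "))
      .distinct
    tokens.foreach { token =>
      // one line per token: "token c1 c2 ... cN"
      pw.println(s"$token ${Array.fill(dim)(random.nextDouble).mkString(" ")}")
    }
    pw.close()
    filename
  }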
@@ -196,8 +199,6 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
 
   def getAssertionLogregModel(dataset: Dataset[Row]) = {
-    val embeddingsPath = generateRandomEmbeddings(dataset, "sentence", 4)
-
     val documentAssembler = new DocumentAssembler()
       .setInputCol("sentence")
       .setOutputCol("document")
@@ -206,10 +207,10 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
       .setLabelCol("label")
       .setInputCols("document")
       .setOutputCol("assertion")
-      .setReg(1.0)
+      .setReg(0.01)
       .setBefore(11)
       .setAfter(13)
-      .setEmbeddingsSource("src/test/resources/ner-corpus/test_embeddings.txt", 3, WordEmbeddingsFormat.Text)
+      .setEmbeddingsSource("src/test/resources/random_embeddings_dim4.txt", 4, WordEmbeddingsFormat.Text)
 
     val pipeline = new Pipeline().setStages(Array(documentAssembler, assertion)).fit(dataset)
     pipeline
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogregApproachSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogregApproachSpec.scala
index ab742b1c97ac75..324da415208aec 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogregApproachSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogregApproachSpec.scala
@@ -14,7 +14,9 @@ class AssertionLogregApproachSpec extends FlatSpec {
   "AssertionLogregApproach" should "be serializable and deserializable correctly" in {
     logregPipelineModel.write.overwrite.save("./test_assertion_pipeline")
     val loadedAssertionPipeline = PipelineModel.read.load("./test_assertion_pipeline")
-    loadedAssertionPipeline.transform(negexDataset)
+    val predicted = loadedAssertionPipeline.transform(negexDataset)
+
+    assert(negexDataset.count == predicted.count)
   }

From 13cc10c5c630d41eac758e84012ac60f6c63a020 Mon Sep 17 00:00:00 2001
From: Alberto
Date: Sat, 27 Jan 2018 14:01:36 -0300
Subject: [PATCH 55/55] removed hard-coded path

---
 python/example/logreg-assertion/assertion.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/example/logreg-assertion/assertion.ipynb b/python/example/logreg-assertion/assertion.ipynb
index e57ff3bb38b04e..659685b486f908 100644
--- a/python/example/logreg-assertion/assertion.ipynb
+++ b/python/example/logreg-assertion/assertion.ipynb
@@ -49,8 +49,8 @@
    "source": [
     "import time\n",
     "\n",
-    "# TODO: fix this hard-coded path\n",
-    "embeddingsFile = '/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin'\n",
+    "# download embeddings from 'https://drive.google.com/open?id=0BzMCqpcgEJgiUWs0ZnU0NlFTam8'\n",
+    "embeddingsFile = 'PubMed-shuffle-win-2.bin'\n",
     "\n",
     "documentAssembler = DocumentAssembler()\\\n",
     "    .setInputCol(\"sentence\")\\\n",
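Putting this section together, the flow the tests now exercise looks roughly like the following (a sketch assembled from the hunks above; the imports are assumptions about the package layout, and dataset stands for any DataFrame with "sentence" and "label" columns):

    import org.apache.spark.ml.Pipeline
    import com.johnsnowlabs.nlp.DocumentAssembler
    import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach
    import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat

    // Assemble raw sentences into documents, then train the assertion annotator
    // against the generated 4-dimensional test embeddings.
    val documentAssembler = new DocumentAssembler()
      .setInputCol("sentence")
      .setOutputCol("document")

    val assertion = new AssertionLogRegApproach()
      .setLabelCol("label")
      .setInputCols("document")
      .setOutputCol("assertion")
      .setReg(0.01)
      .setBefore(11)
      .setAfter(13)
      .setEmbeddingsSource("src/test/resources/random_embeddings_dim4.txt", 4, WordEmbeddingsFormat.Text)

    val model = new Pipeline().setStages(Array(documentAssembler, assertion)).fit(dataset)
    val predicted = model.transform(dataset)  // adds the "assertion" column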