In [1]:
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *


In [2]:
# Start (or reuse) a local Spark session for the assertion-status example.
# The Spark NLP jar is registered under "spark.jars", the key Spark documents
# for extra jars; the previous key "spark.jar" is not a Spark property, so the
# jar setting had no effect.
spark = (
    SparkSession.builder
    .appName("assertion-status")
    .master("local[2]")
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "2G")
    .config("spark.jars", "lib/sparknlp.jar")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

In [3]:
import os
import time

# Path of the pre-trained embeddings file handed to the assertion annotator.
# Set the SPARKNLP_EMBEDDINGS_FILE environment variable to point at your own
# copy; when it is unset the original developer-local path is used unchanged.
embeddingsFile = os.environ.get(
    'SPARKNLP_EMBEDDINGS_FILE',
    '/home/jose/Downloads/bio_nlp_vec/PubMed-shuffle-win-2.bin'
)

# Reads the "sentence" column and emits the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("sentence")
    .setOutputCol("document")
)

# Trainable assertion stage: learns from the "label" column, consumes
# "document" and writes "assertion".
assertion = (
    AssertionLogRegApproach()
    .setLabelCol("label")
    .setInputCols(["document"])
    .setOutputCol("assertion")
    # presumably the context window sizes around the target — TODO confirm
    .setBefore(11)
    .setAfter(13)
    # 200 and 3 are presumably the embedding dimension and the file-format
    # code — TODO confirm against the Spark NLP API
    .setEmbeddingsSource(embeddingsFile, 200, 3)
)

# Converts the "assertion" annotations into a plain column; with keys included
# the values are rendered as `result->Affirmed` in the output shown further down.
finisher = (
    Finisher()
    .setInputCols(["assertion"])
    .setIncludeKeys(True)
)

# Stage order: document assembly, then assertion training, then finishing.
pipeline = Pipeline(stages=[documentAssembler, assertion, finisher])


In [4]:
# Load the input data to be annotated: at most 3000 rows of the test parquet
# file, marked for caching because the frame is reused by the cells below.
data = (
    spark.read
    .parquet("../../../src/test/resources/negex.parquet")
    .limit(3000)
    .cache()
)

# count() is an action, so the read actually runs here; show() then prints
# the first rows.
data.count()
data.show()

+--------------------+--------------------+--------+-----+---+
|            sentence|              target|   label|start|end|
+--------------------+--------------------+--------+-----+---+
|**initials ______...|multinodular goit...|Affirmed|   21| 25|
|02) mild aortic r...|mild aortic regur...|Affirmed|    1|  3|
|02) mild left atr...|mild left atrial ...|Affirmed|    1|  4|
|02) mild left atr...|mild left atrial ...|Affirmed|    1|  4|
|02) mild to moder...|mild to moderate ...|Affirmed|    1|  5|
|02) mild to moder...|mild to moderate ...|Affirmed|    1|  5|
|02) no valvular a...|valvular abnormal...| Negated|    2|  3|
|02) nondilated ri...|nondilated right ...|Affirmed|    1|  9|
|02) normal left v...|normal left ventr...|Affirmed|    1|  4|
|02) normal left v...|normal left ventr...|Affirmed|    1|  6|
|02) paradoxical s...|post-operative se...|Affirmed|    6|  8|
|02) small left ve...|small left ventri...|Affirmed|    1|  8|
|03) mild mitral r...|mild mitral regur...|Affirmed|   

In [5]:
# Fit all pipeline stages on the loaded data; the two prints bracket the call
# so the start and end of training are visible in the cell output.
print('Start fitting')
model = pipeline.fit(data)
print('Fitting is ended')

Start fitting
Fitting is ended


In [6]:
# Run the fitted pipeline over the same data and print the input text, the
# target phrase and the finished assertion value side by side.
result = model.transform(data)
(
    result
    .select("sentence", "target", "finished_assertion")
    .show()
)

+--------------------+--------------------+------------------+
|            sentence|              target|finished_assertion|
+--------------------+--------------------+------------------+
|**initials ______...|multinodular goit...|  result->Affirmed|
|02) mild aortic r...|mild aortic regur...|  result->Affirmed|
|02) mild left atr...|mild left atrial ...|  result->Affirmed|
|02) mild left atr...|mild left atrial ...|  result->Affirmed|
|02) mild to moder...|mild to moderate ...|  result->Affirmed|
|02) mild to moder...|mild to moderate ...|  result->Affirmed|
|02) no valvular a...|valvular abnormal...|   result->Negated|
|02) nondilated ri...|nondilated right ...|  result->Affirmed|
|02) normal left v...|normal left ventr...|  result->Affirmed|
|02) normal left v...|normal left ventr...|  result->Affirmed|
|02) paradoxical s...|post-operative se...|  result->Affirmed|
|02) small left ve...|small left ventri...|  result->Affirmed|
|03) mild mitral r...|mild mitral regur...|  result->Af

In [7]:
# Persist the unfitted pipeline definition, replacing any earlier copy.
(
    pipeline
    .write()
    .overwrite()
    .save("./assertion_pipeline")
)

# Persist the fitted pipeline model the same way.
(
    model
    .write()
    .overwrite()
    .save("./assertion_model")
)

In [8]:
from pyspark.ml import Pipeline, PipelineModel

# Round-trip check of the saved artefacts. Reloading the unfitted pipeline is
# the call that fails in the recorded output below: the JVM side reports that
# AssertionLogRegApproach has no read() method.
Pipeline.load("./assertion_pipeline")

# Reload the fitted model; this line is not reached while the call above raises.
sameModel = PipelineModel.load("./assertion_model")

Py4JJavaError: An error occurred while calling o220.load.
: java.lang.NoSuchMethodException: com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach.read()
	at java.lang.Class.getMethod(Class.java:1786)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:438)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)
	at org.apache.spark.ml.Pipeline$PipelineReader.load(Pipeline.scala:214)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
