In [1]:
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pathlib import Path

if sys.version_info[0] < 3:
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve


In [2]:
# Local two-thread Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("assertion-status")
    .master("local[2]")
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "2G")
    # "spark.jars" (plural) is the documented property for shipping extra jars;
    # the former "spark.jar" key is not a Spark property, so the jar was never registered.
    .config("spark.jars", "lib/sparknlp.jar")
    .getOrCreate()
)

In [3]:
import time


embeddingsFile = 'PubMed-shuffle-win-2.bin'
embeddingsUrl = 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/PubMed-shuffle-win-2.bin'
# Download the embeddings once; this may take a couple of minutes.
# The transfer is written to a temporary ".part" file and renamed only after
# urlretrieve returns, so an interrupted or truncated download is never
# mistaken for a complete embeddings file on the next run.
if not Path(embeddingsFile).is_file():
    partialFile = embeddingsFile + '.part'
    urlretrieve(embeddingsUrl, partialFile)
    Path(partialFile).rename(embeddingsFile)

# Wrap the raw "sentence" column into a "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("sentence")
    .setOutputCol("document")
)

# Tokenize each document into the "token" column.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained part-of-speech tagger; its "pos" output feeds the NER stage below.
pretrained_pos = (
    PerceptronModel().pretrained()
    .setInputCols(["document", "token"])
    .setOutputCol("pos")
)

# Pretrained CRF named-entity recogniser writing to the "ner" column.
pretrained_ner = (
    NerCrfModel().pretrained()
    .setInputCols(["document", "token", "pos"])
    .setOutputCol("ner")
)

# Trainable assertion-status stage: learns from the "label" column and reads
# the "ner" column produced above.
# NOTE(review): in setEmbeddingsSource, 200 is presumably the embedding
# dimension and 3 the embeddings file format code — TODO confirm.
assertion = (
    AssertionDLApproach()
    .setLabelCol("label")
    .setInputCols(["document"])
    .setOutputCol("assertion")
    .setBatchSize(16)
    .setEpochs(5)
    .setEmbeddingsSource(embeddingsFile, 200, 3)
    .setNerCol("ner")
)

# Flatten the "assertion" annotations into a plain column, keeping the keys.
finisher = (
    Finisher()
    .setInputCols(["assertion"])
    .setIncludeKeys(True)
)

# Stage order matters: every stage consumes columns produced by earlier ones.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    pretrained_pos,
    pretrained_ner,
    assertion,
    finisher,
])


In [4]:
# Load the input data to be annotated: at most 3000 rows of the negex
# parquet file, marked for caching because it is reused by later cells.
data = (
    spark.read
    .parquet("../../../src/test/resources/negex.parquet")
    .limit(3000)
    .cache()
)
data.count()  # run an action on the cached frame; the count itself is not displayed
data.show()

+--------------------+--------------------+--------+-----+---+
|            sentence|              target|   label|start|end|
+--------------------+--------------------+--------+-----+---+
|**initials ______...|multinodular goit...|Affirmed|   21| 25|
|02) mild aortic r...|mild aortic regur...|Affirmed|    1|  3|
|02) mild left atr...|mild left atrial ...|Affirmed|    1|  4|
|02) mild left atr...|mild left atrial ...|Affirmed|    1|  4|
|02) mild to moder...|mild to moderate ...|Affirmed|    1|  5|
|02) mild to moder...|mild to moderate ...|Affirmed|    1|  5|
|02) no valvular a...|valvular abnormal...| Negated|    2|  3|
|02) nondilated ri...|nondilated right ...|Affirmed|    1|  9|
|02) normal left v...|normal left ventr...|Affirmed|    1|  4|
|02) normal left v...|normal left ventr...|Affirmed|    1|  6|
|02) paradoxical s...|post-operative se...|Affirmed|    6|  8|
|02) small left ve...|small left ventri...|Affirmed|    1|  8|
|03) mild mitral r...|mild mitral regur...|Affirmed|   

In [5]:
# Fit the full pipeline on the loaded data and print the elapsed wall-clock seconds.
start = time.time()
print("Start fitting")
model = pipeline.fit(data)
print("Fitting is ended")
elapsed_seconds = time.time() - start
print(elapsed_seconds)

Start fitting


Exception ignored in: <bound method JavaParams.__del__ of PerceptronModel_4a4b82d309d3f5aa8ba4>
Traceback (most recent call last):
  File "C:\Users\saifa\apps\spark-2.3.0-bin-hadoop2.7\python\pyspark\ml\wrapper.py", line 105, in __del__
    SparkContext._active_spark_context._gateway.detach(self._java_obj)
  File "C:\Users\saifa\apps\spark-2.3.0-bin-hadoop2.7\python\lib\py4j-0.10.6-src.zip\py4j\java_gateway.py", line 1897, in detach
AttributeError: 'NoneType' object has no attribute '_detach'
Exception ignored in: <bound method JavaParams.__del__ of NerCrfModel_4c679af5e56d78814730>
Traceback (most recent call last):
  File "C:\Users\saifa\apps\spark-2.3.0-bin-hadoop2.7\python\pyspark\ml\wrapper.py", line 105, in __del__
    SparkContext._active_spark_context._gateway.detach(self._java_obj)
  File "C:\Users\saifa\apps\spark-2.3.0-bin-hadoop2.7\python\lib\py4j-0.10.6-src.zip\py4j\java_gateway.py", line 1897, in detach
AttributeError: 'NoneType' object has no attribute '_detach'


Fitting is ended
301.67993450164795


In [15]:
# The last stage of the fitted pipeline is the Finisher; make it also flatten
# the "ner" column alongside "assertion".
fitted_finisher = model.stages[-1]
fitted_finisher.setInputCols(["assertion", "ner"])

In [26]:
# Run the fitted pipeline on a single hand-written sentence and show its only row.
sample_df = spark.createDataFrame([["Hello Peter how are you?"]]).toDF("sentence")
model.transform(sample_df).take(1)[0]

Row(sentence='Hello Peter how are you?', _text='Hello Peter how are you?', finished_assertion='result->Affirmed@result->Affirmed@result->Affirmed@result->Affirmed@result->Affirmed@result->Affirmed', finished_ner='word->Hello#result->I-PER@word->Peter#result->I-PER@word->how#result->O@word->are#result->O@word->you#result->O@word->?#result->O')

In [21]:
# Annotate the training data itself and inspect the fifth of the first ten rows.
result = model.transform(data)
preview_rows = result.select("target", "finished_assertion", "finished_ner").take(10)
preview_rows[4]

Row(target='mild to moderate aortic regurgitation', finished_assertion='result->Affirmed@result->Affirmed@result->Affirmed@result->Affirmed@result->Affirmed@result->Affirmed@result->Affirmed', finished_ner='word->02#result->O@word->)#result->O@word->mild#result->O@word->to#result->O@word->moderate#result->O@word->aortic#result->O@word->regurgitation#result->O')

In [7]:
# Persist both the unfitted pipeline and the fitted model, replacing any
# previous copies at the same paths.
for artifact, target_path in (
    (pipeline, "./assertion_pipeline"),
    (model, "./assertion_model"),
):
    artifact.write().overwrite().save(target_path)

In [None]:
from pyspark.ml import PipelineModel, Pipeline

# Check that the saved pipeline definition can be read back (result discarded),
# then reload the fitted model for reuse.
Pipeline.load("./assertion_pipeline")
sameModel = PipelineModel.load("./assertion_model")

In [None]:
# Apply the reloaded model to the same data and display the assertion output.
reloaded_predictions = sameModel.transform(data)
reloaded_predictions.select("sentence", "target", "finished_assertion").show()