In [30]:
import json
import os

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

# Set JAVA_HOME to a local JDK 1.8 install before the Spark session is created below.
# NOTE(review): this is a hard-coded, machine-specific macOS path — adjust it for your environment.
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_271.jdk/Contents/Home"

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import pandas as pd

# Spark configuration overrides for the local session: driver memory, the
# Kryo serializer buffer ceiling, and the cap on results collected to the driver.
params = {"spark.driver.memory":"4G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"}

# Hand the overrides to the session builder. Previously `params` was defined
# but never passed, so the session silently ran with Spark NLP's defaults.
spark = sparknlp.start(params=params)
spark



In [31]:
# (Re)load the autoreload extension and use mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [32]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: attach pretrained 'glove_100d' word embeddings to the tokens.
embeddings = (
    WordEmbeddingsModel.pretrained('glove_100d')
    .setInputCols(["document", 'token'])
    .setOutputCol("embeddings")
)

# Stage 4: pretrained English "ner_dl" tagger reading documents, tokens and embeddings.
ner = (
    NerDLModel.pretrained("ner_dl", 'en')
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

# Chain the four stages in order.
ner_pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    embeddings,
    ner,
])

# Fit on a one-row DataFrame holding an empty string to obtain a PipelineModel.
ner_model_pipeline = ner_pipeline.fit(
    spark.createDataFrame([[""]]).toDF("text")
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [33]:
from nlptest.nlptest import Harness

# Build a "ner" harness around the in-memory fitted pipeline, pointing it at
# the CoNLL test data and the YAML config file.
h = Harness(
    "ner",
    model=ner_model_pipeline,
    data="data/test.conll",
    config='data/config.yml',
)

In [34]:
# Grab the handler object that the harness's model wrapper exposes as `model_class`.
model_handler = h.model.model_class

In [35]:
# The harness-level wrapper is always a ModelFactory, regardless of the library behind it.
type(h.model)

nlptest.nlptest.modelhandler.modelhandler.ModelFactory

In [36]:
# The handler itself is a library-specific ModelHandler subclass
# (here the John Snow Labs NER handler, as the output shows).
type(model_handler)

nlptest.nlptest.modelhandler.jsl_modelhandler.PretrainedModelForNER

In [37]:
# For Spark models the handler's underlying `model` is a Spark NLP LightPipeline.
model_handler.model

<sparknlp.base.light_pipeline.LightPipeline at 0x7f9474452430>

In [38]:
# Run the handler's predict() on a single raw sentence.
# Per the original note, predict is overloaded and returns an NEROutput.
ner_output = model_handler.predict('Apple is a technology company founded in San Fransisco')

In [39]:
# Print the tokens on one line and their predicted entity tags on the next.
print(" ".join(p.span.word for p in ner_output.predictions))
print(" ".join(p.entity for p in ner_output.predictions))

Apple is a technology company founded in San Fransisco
B-ORG O O O O O O B-LOC I-LOC


In [40]:
# Inspect the attributes of the first prediction (entity, span, score).
vars(ner_output.predictions[0])

{'entity': 'B-ORG',
 'span': <Span(start=0, end=4, word='Apple')>,
 'score': 0.9934}

In [None]:
# Persist the fitted pipeline so it can be loaded again by path.
# Use the writer's overwrite mode: a plain save() raises when the target
# directory already exists, which breaks re-running this cell.
ner_model_pipeline.write().overwrite().save('./saved_spark_pipeline')

In [42]:
# Rebuild the harness from a path string instead of an in-memory object:
# `model` points at the pipeline directory saved in the previous cell and
# `hub='johnsnowlabs'` selects the John Snow Labs loader.
# load_model supports johnsnowlabs NLU inference and Models Hub
h = Harness(
    "ner",
    hub='johnsnowlabs',
    model='./saved_spark_pipeline',
    data="data/test.conll",
    config = 'data/config.yml'
)



2023-03-02 10:52:35.330564: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]
AddV2: CPU 
AssignSub: CPU 
RealDiv: CPU 
Shape: CPU 
Unique: CPU 
Cast: CPU 
UnsortedSegmentSum: CPU 
Add: CPU 
GatherV2: CPU 
StridedSlice: CPU 
Identity: CPU 
Fill: CPU 
NoOp: CPU 
RandomUniform: CPU 
Mul: CPU 
Sub: CPU 
Sqrt: CPU 
Assign: CPU 
VariableV2: CPU 
Scatte

In [43]:
# Fetch the handler from the harness rebuilt from the saved pipeline and display it.
model_handler = h.model.model_class
model_handler

<nlptest.nlptest.modelhandler.jsl_modelhandler.PretrainedModelForNER at 0x7f9473238070>

In [44]:
# Same single-sentence prediction as before, now through the handler built
# from the saved pipeline. Per the original note, predict returns an NEROutput.
ner_output = model_handler.predict('Apple is a technology company founded in San Fransisco')

In [45]:
# Print the tokens and, beneath them, the tags predicted by the reloaded pipeline.
print(" ".join(p.span.word for p in ner_output.predictions))
print(" ".join(p.entity for p in ner_output.predictions))

Apple is a technology company founded in San Fransisco
B-ORG O O O O O O B-LOC I-LOC


In [46]:
from johnsnowlabs import nlp

In [48]:
# Load the 'ner.dl' reference through the johnsnowlabs `nlp` API.
# NOTE(review): `jsl_model` is not used in the visible cells — the Harness
# below is still built from the saved pipeline path, exactly like the earlier
# cell. Presumably the loaded model was meant to be tested here; confirm.
jsl_model = nlp.load('ner.dl')
# load_model supports johnsnowlabs NLU inference and Models Hub
h = Harness(
    "ner",
    hub='johnsnowlabs',
    model='./saved_spark_pipeline',
    data="data/test.conll",
    config = 'data/config.yml'
)

recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[ \ ]recognize_entities_dl download started this may take some time.
Approximate size to download 160.1 MB
Download done! Loading the resource.
[ / ]

2023-03-02 10:53:19.396468: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]
AddV2: CPU 
AssignSub: CPU 
RealDiv: CPU 
Shape: CPU 
Unique: CPU 
Cast: CPU 
UnsortedSegmentSum: CPU 
Add: CPU 
GatherV2: CPU 
StridedSlice: CPU 
Identity: CPU 
Fill: CPU 
NoOp: CPU 
RandomUniform: CPU 
Mul: CPU 
Sub: CPU 
Sqrt: CPU 
Assign: CPU 
VariableV2: CPU 
Scatte

[OK!]


2023-03-02 10:53:25.677024: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]
AddV2: CPU 
AssignSub: CPU 
RealDiv: CPU 
Shape: CPU 
Unique: CPU 
Cast: CPU 
UnsortedSegmentSum: CPU 
Add: CPU 
GatherV2: CPU 
StridedSlice: CPU 
Identity: CPU 
Fill: CPU 
NoOp: CPU 
RandomUniform: CPU 
Mul: CPU 
Sub: CPU 
Sqrt: CPU 
Assign: CPU 
VariableV2: CPU 
Scatte