Show how to use pretrained models (lemmatizer, spell checker, word embeddings and NER) in a custom pipeline

In [1]:
import sys

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.pretrained import ResourceDownloader

from pathlib import Path

if sys.version_info[0] < 3:
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve

In [2]:
# Obtain the session object through Spark NLP's helper; every later cell
# uses this `spark` handle.
spark = sparknlp.start()


# Report the library versions this notebook was executed with.
print("Spark NLP version")
# NOTE(review): in the recorded run this call wrote the version to stdout
# itself rather than returning it for display — confirm for newer releases.
sparknlp.version()
print("Apache Spark version")
# Last expression of the cell, so the notebook displays it as the cell result.
spark.version

Spark NLP version
2.2.1
Apache Spark version


'2.4.3'

Create some data for testing purposes

In [3]:
from pyspark.sql import Row

# Row factory naming the three columns of the test DataFrame.
R = Row('sentence', 'start', 'end')

# Single-row DataFrame: one example sentence plus the values 0 and 1 for the
# `start` and `end` columns.
test_data = spark.createDataFrame(
    [
        R('Peter is a good person, and he was working at IBM', 0, 1),
    ]
)

Create a custom pipeline

In [4]:
import time

# Each stage declares the column(s) it reads and the column it writes, so the
# wiring between stages is explicit and does not depend on defaults stored
# inside a downloaded model.

# Reads the raw text column "sentence" and writes "document".
documentAssembler = (
    DocumentAssembler()
    .setInputCol("sentence")
    .setOutputCol("document")
)

# Reads "document" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained lemmatizer: reads "token", writes "lemma".
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Pretrained Norvig-Sweeting spell checker: reads "token", writes "spell".
spell = (
    NorvigSweetingModel.pretrained()
    .setInputCols(["token"])
    .setOutputCol("spell")
)

# Pretrained word embeddings. The output column is set explicitly because the
# NER stage below reads a column named "embeddings"; previously this relied on
# whatever output column the downloaded model happened to carry.
embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Pretrained deep-learning NER model. `pretrained` is called on the class,
# like the other stages, instead of on a throwaway instance.
ner_dl = (
    NerDLModel.pretrained()
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner_dl")
)

# Converts the selected annotation columns for final output and keeps their
# metadata.
finisher = (
    Finisher()
    .setInputCols(["ner_dl", "lemma", "spell"])
    .setIncludeMetadata(True)
)

# Stage order matters: every stage's input columns must have been produced by
# an earlier stage (or exist in the input DataFrame).
pipeline_fast_dl = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    lemmatizer,
    spell,
    embeddings,
    ner_dl,
    finisher,
])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 144.3 MB
[OK!]
ner_dl_contrib download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


Now let's use this pipeline and see the results

In [5]:
# Fit the pipeline on the test DataFrame, apply the fitted model to that same
# DataFrame, and print the resulting rows without truncating column contents.
(
    pipeline_fast_dl
    .fit(test_data)
    .transform(test_data)
    .show(truncate=False)
)

+-------------------------------------------------+-----+---+--------------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|sentence                                         |start|end|finished_ner_dl                             |finished_lemma                                             |finished_spell                   