Import the necessary modules

In [1]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import time
import zipfile

Initialize Spark, if not already in a pyspark environment

In [2]:
# Build (or reuse) the Spark session for this notebook: local master using all
# cores, with the Spark NLP jar added to the driver classpath from a relative path.
spark = (
    SparkSession.builder
    .appName("ner")
    .master("local[*]")
    # Driver-side memory and result-size limits.
    .config("spark.driver.memory", "6G")
    .config("spark.driver.maxResultSize", "2G")
    .config("spark.driver.extraClassPath", "../../lib/sparknlp.jar")
    # Upper bound for the Kryo serialization buffer.
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

Create the annotator components with appropriate parameters and in the right order. The Finisher outputs only the NER spans. Put everything in a Pipeline.

In [8]:
# NER pipeline: text -> document -> sentence -> token -> ner -> ner_span -> finished output.

# Wrap the raw "text" column into a "document" annotation column.
documentAssembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Split each document into sentences.
sentenceDetector = SentenceDetector()\
  .setInputCols(["document"])\
  .setOutputCol("sentence")

# Tokenize the detected sentences (not the whole document) so the tokens line up
# with the "sentence" column that the NER tagger consumes together with them.
tokenizer = Tokenizer()\
  .setInputCols(["sentence"])\
  .setOutputCol("token")

# Load the pretrained NER model. pretrained() is called on the class directly;
# there is no need to construct a throwaway NerDLModel instance first.
nerTagger = NerDLModel.pretrained()\
  .setInputCols(["sentence", "token"])\
  .setOutputCol("ner")

# Convert the per-token tags into entity spans, using the same sentence/token
# columns the tagger was fed.
converter = NerConverter()\
  .setInputCols(["sentence", "token", "ner"])\
  .setOutputCol("ner_span")

# Keep only the entity spans in the final output, including their metadata keys.
finisher = Finisher() \
    .setInputCols(["ner_span"]) \
    .setIncludeKeys(True)

# Stage order matters: each stage reads columns produced by the ones before it.
pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        nerTagger,
        converter,
        finisher,
    ]
)


Create a small in-memory DataFrame with the text to annotate

In [9]:
# Input data to be annotated: a single-row DataFrame with one "text" column.
sample_rows = [["Peter was working on IBM last year."]]
data = spark.createDataFrame(sample_rows).toDF("text")

Fit the pipeline (the NER model is pretrained, so no NER training happens here)

In [10]:
# Fit the pipeline on the sample data and report the wall-clock time in seconds.
fit_started_at = time.time()
print("Start fitting")
model = pipeline.fit(data)
print("Fitting is ended")
elapsed_seconds = time.time() - fit_started_at
print(elapsed_seconds)

Start fitting
Fitting is ended
1117.2223477363586


Let's predict with the model

In [12]:
# Annotate the input with the fitted pipeline and print the result without truncating columns.
predictions = model.transform(data)
predictions.show(truncate=False)

+-----------------------------------+-------------------------------------------------+
|text                               |finished_ner_span                                |
+-----------------------------------+-------------------------------------------------+
|Peter was working on IBM last year.|entity->PER#result->Peter@entity->ORG#result->IBM|
+-----------------------------------+-------------------------------------------------+

