Import appropriate modules

In [1]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import time
import zipfile

Start Spark

In [2]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("ner")
    .master("local[*]")
    .config("spark.driver.memory", "4G")
    # Path to the Spark NLP jar, relative to this notebook's directory.
    .config("spark.driver.extraClassPath", "../../lib/sparknlp.jar")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

1. Download CoNLL2003 dataset
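2. Save the 3 files eng.train, eng.testa, and eng.testb into the working dir ./ (note: the NER cell below reads `../../data/ner/eng.train`, so adjust either the file location or that path so they match)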

Create the annotator components in the right order, with their training params. The Finisher will output only the NER results. Put them all in a pipeline.

In [7]:
# Reads the raw "text" column and writes the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Reads "document" and writes "sentence".
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Reads "document" (not "sentence") and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained part-of-speech tagger; reads "token" and "document", writes "pos".
posTagger = (
    PerceptronModel()
    .pretrained()
    .setInputCols(["token", "document"])
    .setOutputCol("pos")
)

# CRF-based NER trainer; consumes "sentence", "token" and "pos", writes "ner".
nerTagger = (
    NerCrfApproach()
    .setInputCols(["sentence", "token", "pos"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    # Training loop limits and convergence threshold.
    .setMinEpochs(1)
    .setMaxEpochs(5)
    .setLossEps(1e-3)
    # NOTE(review): 100 presumably is the embedding dimension (matches the
    # "100d" file name) and 2 a format code -- confirm against the Spark NLP API.
    .setEmbeddingsSource("../../data/embeddings/glove.6B.100d.txt", 100, 2)
    # NOTE(review): "," is presumably the field delimiter of dict.txt -- confirm.
    .setExternalFeatures("../../data/ner/dict.txt", ",")
    # Training data is read from this external file.
    .setExternalDataset("../../data/ner/eng.train")
    .setL2(1)
    .setC0(1250000)
    .setRandomSeed(0)
    .setVerbose(2)
)

# Finisher reads only the "ner" column; keys are included in its output.
finisher = (
    Finisher()
    .setInputCols(["ner"])
    .setIncludeKeys(True)
)

# Assemble the stages in the order they were defined above: each stage
# reads columns written by earlier ones.
pipeline = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    posTagger,
    nerTagger,
    finisher,
])


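Train the model. The DataFrame passed to `fit` is not really used as training data; the CRF tagger configured above reads its training examples from the external dataset set via `setExternalDataset`.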

In [9]:
# One-row DataFrame whose single column is named "text".
data = (
    spark
    .createDataFrame([["Peter works at IBM"]])
    .toDF("text")
)

In [8]:
# Fit the pipeline and report how many wall-clock seconds it took.
start = time.time()
print("Start fitting")
model = pipeline.fit(data)
print("Fitting is ended")
# Elapsed time is measured after the "ended" message, in seconds.
elapsed = time.time() - start
print(elapsed)

Start fitting
Fitting is ended
333.66956210136414


Run the prediction

In [13]:
# Apply the fitted pipeline model to the sample DataFrame.
ner_data = model.transform(data)
# Display the result without truncating long cell values.
ner_data.show(truncate=False)

+------------------+------------------------------------------------------------------------------------------+
|text              |finished_ner                                                                              |
+------------------+------------------------------------------------------------------------------------------+
|Peter works at IBM|word->Peter#result->I-PER@word->works#result->O@word->at#result->O@word->IBM#result->I-ORG|
+------------------+------------------------------------------------------------------------------------------+

