In [1]:
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import zipfile
import os
from pathlib import Path
import urllib.request


In [2]:
spark = SparkSession.builder \
    .appName("ner")\
    .master("local[1]")\
    .config("spark.driver.memory","8G")\
    .config("spark.driver.maxResultS1ize", "2G")\
    .config("spark.jar", "lib/sparknlp.jar")\
    .config("spark.kryoserializer.buffer.max", "500m")\
    .getOrCreate()

1. Download CoNLL2003 dataset
2. Save 3 files eng.train, eng.testa, eng.testa, into working dir ./

In [3]:
# Example how to download CoNLL 2003 Dataset
def download_conll2003_file(file):    
    if not Path(file).is_file():
        url = "https://raw.githubusercontent.com/patverga/torch-ner-nlp-from-scratch/master/data/conll2003/" + file
        urllib.request.urlretrieve(url, file)
        
download_conll2003_file("eng.train")
download_conll2003_file("eng.testa")
download_conll2003_file("eng.testb")

3 Download Glove word embeddings

In [4]:
file = "glove.6B.zip"
if not Path("glove.6B.zip").is_file():
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    print("Start downoading Glove Word Embeddings. It will take some time, please wait...")
    urllib.request.urlretrieve(url, "glove.6B.zip")
    print("Downloading finished")
    
if not Path("glove.6B.100d.txt").is_file():
    zip_ref = zipfile.ZipFile(file, 'r')
    zip_ref.extractall("./")
    zip_ref.close()


In [5]:
from pyspark.sql.types import *

class Annotation:
    def __init__(self, annotatorType, begin, end, result, metadata):
        self.annotatorType = annotatorType
        self.begin = begin
        self.end = end
        self.result = result
        self.metadata = metadata

        
annotation_schema = StructType([
    StructField("annotatorType", StringType()),
    StructField("begin", IntegerType(), False),
    StructField("end", IntegerType(), False),
    StructField("result", StringType()),
    StructField("metadata", MapType(StringType(), StringType()))
])
    


def readDataset(file, doc_column = "text", label_column = "label"):
    global spark
    
    result = []
    doc = ""
    labels = []

    with open(file) as f:
        for line in f:
            items = line.split(' ')
            word = items[0]
            if word == "-DOCSTART-":
                result.append((doc, labels))
                doc = ""
                labels = []
            elif len(items) <= 1:
                doc = doc + " \n"
            else:
                if len(doc) > 0:
                    doc = doc + " "

                begin = len(doc)
                doc = doc + word
                end = len(doc) - 1
                ner = items[3]
                labels.append(Annotation("named_entity", begin, end, ner, {}))

    if doc:
        result.append((doc, labels))
    
    global annotation_schema
    
    schema =  StructType([
      StructField(doc_column, StringType()),
      StructField(label_column, ArrayType(annotation_schema))
    ])
    
    
    return spark.createDataFrame(result, schema = schema)

In [6]:
import time

def get_pipeline():
    documentAssembler = DocumentAssembler()\
      .setInputCol("text")\
      .setOutputCol("document")

    sentenceDetector = SentenceDetector()\
      .setInputCols(["document"])\
      .setOutputCol("sentence")

    tokenizer = Tokenizer()\
      .setInputCols(["document"])\
      .setOutputCol("token")

    nerTagger = NerDLApproach()\
      .setInputCols(["sentence", "token"])\
      .setLabelColumn("label")\
      .setOutputCol("ner")\
      .setMaxEpochs(10)\
      .setExternalDataset("file://" + os.getcwd() + "/eng.train")\
      .setEmbeddingsSource("file://" + os.getcwd() + "/glove.6B.100d.txt", 100, 2)\
      .setRandomSeed(0)\
      .setVerbose(2)

    converter = NerConverter()\
      .setInputCols(["document", "token", "ner"])\
      .setOutputCol("ner_span")
      
    pipeline = Pipeline(
        stages = [
        documentAssembler,
        sentenceDetector,
        tokenizer,
        nerTagger,
        converter
      ])
    
    return pipeline


def train_model(file):
    global spark
    
    print("Dataset Reading")
    
    start = time.time()
    dataset = readDataset(file)
    print("Done, {}\n".format(time.time() - start))

    print("Start fitting")
    pipeline = get_pipeline()

    return pipeline.fit(dataset)

In [15]:
from pyspark.sql.functions import col, udf, explode


def get_dataset_for_analysis(file, model):
    global spark
    
    print("Dataset Reading")
    
    start = time.time()
    dataset = readDataset(file)
    print("Done, {}\n".format(time.time() - start))
    
    predicted = model.transform(dataset)
    
    global annotation_schema
    
    zip_annotations = udf(
      lambda x, y: list(zip(x, y)),
      ArrayType(StructType([
          StructField("predicted", annotation_schema),
          StructField("label", annotation_schema)
      ]))
    )
    
    return predicted\
        .withColumn("result", zip_annotations("ner", "label"))\
        .select(explode("result").alias("result"))\
        .select(
            col("result.predicted").alias("predicted"), 
            col("result.label").alias("label")
        )
        
def printStat(label, correct, predicted, predictedCorrect):
    prec = predictedCorrect / predicted if predicted > 0 else 0
    rec = predictedCorrect / correct if correct > 0 else 0
    f1 = (2*prec*rec)/(prec + rec) if prec + rec > 0 else 0
    
    print("{}\t{}\t{}\t{}".format(label, prec, rec, f1))
        

def test_dataset(file, model, ignore_tokenize_misses=True):
    global spark
    
    started = time.time()

    df = readDataset(file)
    transformed = model.transform(df).select("label", "ner")

    labels = []
    predictedLabels = []

    for line in transformed.collect():
        label = line[0]
        ner = line[1]
        ner = {(a["begin"], a["end"]):a["result"] for a in ner}

        for a in label:
            key = (a["begin"], a["end"])

            label = a["result"].strip()
            predictedLabel = ner.get(key, "O").strip()
            
            if key not in ner and ignore_tokenize_misses:
                continue
                
            labels.append(label)
            predictedLabels.append(predictedLabel)
        

    correct = {}
    predicted = {}
    predictedCorrect = {}


    print(len(labels))

    for (lPredicted, lCorrect) in zip(predictedLabels, labels):
        correct[lCorrect] = correct.get(lCorrect, 0) + 1
        predicted[lPredicted] = predicted.get(lPredicted, 0) + 1

        if lCorrect == lPredicted:
            predictedCorrect[lPredicted] = predictedCorrect.get(lPredicted, 0) + 1

    correct = { key: correct[key] for key in correct.keys() if key != 'O'}
    predicted = { key: predicted[key] for key in predicted.keys() if key != 'O'}
    predictedCorrect = { key: predictedCorrect[key] for key in predictedCorrect.keys() if key != 'O'}

    tags = set(list(correct.keys()) + list(predicted.keys()))

    print("label\tprec\trec\tf1")
    totalCorrect = sum(correct.values())
    totalPredicted = sum(predicted.values())
    totalPredictedCorrect = sum(predictedCorrect.values())

    printStat("Total", totalCorrect, totalPredicted, totalPredictedCorrect)

    for label in tags:
        printStat(label, correct.get(label, 0), predicted.get(label, 0), predictedCorrect.get(label, 0))


In [8]:
import os.path

folder = '.'
train_file = os.path.join(folder, "eng.train")
test_file_a = os.path.join(folder, "eng.testa")
test_file_b = os.path.join(folder, "eng.testb")

In [9]:
model = train_model(train_file)

Dataset Reading
Done, 4.325708627700806

Start fitting


In [16]:
print("\nQuality on training data")
test_dataset(train_file, model)

print("\n\nQuality on validation data")
test_dataset(test_file_a, model)

print("\n\nQuality on test data")
test_dataset(test_file_b, model)


Quality on training data
200019
label	prec	rec	f1
Total	0.711194940641296	0.3797955858391349	0.4951623182248315
B-MISC	0	0.0	0
B-LOC	0	0.0	0
I-MISC	0.6804444444444444	0.34212290502793297	0.45531598513011157
I-LOC	0.7260544815465729	0.40050896752302473	0.5162449234614183
B-ORG	0	0.0	0
I-PER	0.7694149334131378	0.4643727986995394	0.5791845010137419
I-ORG	0.6258533362695442	0.2875645047050491	0.3940654464780921


Quality on validation data
50487
label	prec	rec	f1
Total	0.7347897100857493	0.42271552736669016	0.5366835669549658
B-MISC	0	0.0	0
I-MISC	0.7145061728395061	0.3698083067092652	0.4873684210526315
I-LOC	0.739448051948052	0.4373499759961594	0.5496229260935144
I-PER	0.8011695906432749	0.5262483994878361	0.6352395672333848
I-ORG	0.6014492753623188	0.2832764505119454	0.3851508120649652


Quality on test data
45488
label	prec	rec	f1
Total	0.712688381137579	0.3635912698412698	0.48152405977993107
B-MISC	0	0.0	0
B-LOC	0	0.0	0
I-MISC	0.6385542168674698	0.35333333333333333	0.45493562231759654

In [11]:
df = get_dataset_for_analysis(test_file_a, model)
df.show()

Dataset Reading
Done, 0.6434798240661621

+--------------------+--------------------+
|           predicted|               label|
+--------------------+--------------------+
|[named_entity,3,9...|[named_entity,3,9...|
|[named_entity,11,...|[named_entity,11,...|
|[named_entity,13,...|[named_entity,13,...|
|[named_entity,28,...|[named_entity,28,...|
|[named_entity,33,...|[named_entity,33,...|
|[named_entity,38,...|[named_entity,38,...|
|[named_entity,41,...|[named_entity,41,...|
|[named_entity,45,...|[named_entity,45,...|
|[named_entity,51,...|[named_entity,51,...|
|[named_entity,59,...|[named_entity,59,...|
|[named_entity,67,...|[named_entity,67,...|
|[named_entity,71,...|[named_entity,71,...|
|[named_entity,78,...|[named_entity,78,...|
|[named_entity,91,...|[named_entity,91,...|
|[named_entity,96,...|[named_entity,96,...|
|[named_entity,103...|[named_entity,103...|
|[named_entity,115...|[named_entity,115...|
|[named_entity,120...|[named_entity,120...|
|[named_entity,128...|[named_entit

In [12]:
get_pipeline().write().overwrite().save("./crf_pipeline")
model.write().overwrite().save("./crf_model")

In [13]:
from pyspark.ml import PipelineModel, Pipeline

Pipeline.read().load("./crf_pipeline")
sameModel = PipelineModel.read().load("./crf_model")

In [14]:
print("\nQuality on training data")
test_dataset(train_file, sameModel)

print("\n\nQuality on validation data")
test_dataset(test_file_a, sameModel)

print("\n\nQuality on test data")
test_dataset(test_file_b, sameModel)


Quality on training data
[]
[]
[Row(annotatorType='named_entity', begin=3, end=4, result='I-ORG\n', metadata={}), Row(annotatorType='named_entity', begin=6, end=12, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=14, end=19, result='I-MISC\n', metadata={}), Row(annotatorType='named_entity', begin=21, end=24, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=26, end=27, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=29, end=35, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=37, end=43, result='I-MISC\n', metadata={}), Row(annotatorType='named_entity', begin=45, end=48, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=50, end=50, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=54, end=58, result='I-PER\n', metadata={}), Row(annotatorType='named_entity', begin=60, end=68, result='I-PER\n', metadata={}), Row(annotatorType='named_entity', begin=72, end=79, result='I-LOC\n'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


[Row(annotatorType='named_entity', begin=3, end=6, result='I-LOC\n', metadata={}), Row(annotatorType='named_entity', begin=8, end=15, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=17, end=25, result='I-PER\n', metadata={}), Row(annotatorType='named_entity', begin=27, end=30, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=32, end=34, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=36, end=43, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=45, end=50, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=52, end=52, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=56, end=62, result='I-LOC\n', metadata={}), Row(annotatorType='named_entity', begin=64, end=73, result='O\n', metadata={}), Row(annotatorType='named_entity', begin=77, end=83, result='I-LOC\n', metadata={}), Row(annotatorType='named_entity', begin=85, end=92, result='O\n', metadata={}), Row(annotatorType='named_e

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
