In [None]:
! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

In [None]:
import sparknlp
# sparknlp.start() returns the Spark session; later cells refer to it as `spark`
spark = sparknlp.start()

In [None]:
# Download the CoNLL-2003 sample files from the Spark NLP test resources:
# eng.train is read for training and eng.testa for evaluation in later cells.
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.train
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.testa

In [None]:
from sparknlp.training import CoNLL

# Parse the downloaded CoNLL training file into a Spark DataFrame
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, './eng.train')

# Peek at the first rows only
training_data.show(3)

In [None]:
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [None]:
# Train word-level embeddings on the tokens of the training set.
# NerDLApproach consumes token-level (word) embeddings, so this stage has to be
# Word2VecApproach; Doc2VecApproach produces sentence-level embeddings and
# cannot feed the tagger.
word2Vec = (
    Word2VecApproach()
    .setInputCols("token")
    .setOutputCol("embeddings")
    .setMaxSentenceLength(1000)
    .setStepSize(0.025)
    .setMinCount(5)
    .setVectorSize(100)
    .setNumPartitions(1)
    .setMaxIter(1)
    .setSeed(42)
    # label for the embeddings this pipeline trains (Word2Vec on CoNLL-2003)
    .setStorageRef("word2vec_conll03")
)

# NER tagger trained on top of the embeddings above; gold tags come from the `label` column.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(5)
    .setLr(0.003)
    .setBatchSize(8)
    .setRandomSeed(0)
    .setVerbose(1)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    .setValidationSplit(0.2)
    .setOutputLogsPath('ner_logs')  # if not set, logs will be written to ~/annotator_logs
    # .setGraphFolder('graphs') >> put your graph file (pb) under this folder if you are using a custom graph generated thru 4.1 NerDL-Graph.ipynb notebook
    # .setEnableMemoryOptimizer() >> if you have a limited memory and a large conll file, you can set this True to train batch by batch
)

# No DocumentAssembler/Tokenizer stages here: the pipeline reads the
# `sentence`, `token` and `label` columns of the training DataFrame directly.
ner_pipeline = Pipeline(stages=[
    word2Vec,
    nerTagger
])



In [None]:
# Fit every pipeline stage on the training data; the fitted pipeline is reused below as `ner_model`
ner_model = ner_pipeline.fit(training_data)

In [None]:
!cd ~/annotator_logs && ls -l

In [None]:
!cat ~/annotator_logs/{sentimentdl.uid}.log

In [None]:
from sparknlp.training import CoNLL

# Parse the held-out CoNLL file the same way as the training file
test_reader = CoNLL()
test_data = test_reader.readDataset(spark, './eng.testa')

# Peek at the first rows only
test_data.show(3)

In [None]:
# Run the fitted pipeline over the test set; the tagger writes its output to the `ner` column
predictions = ner_model.transform(test_data)

In [None]:
from sklearn.metrics import classification_report

import pyspark.sql.functions as F

# Zip the three parallel per-row arrays (token text, gold label, predicted label)
# and explode them so each output row describes a single token.
zipped_rows = predictions.select(
    F.explode(F.arrays_zip('token.result', 'label.result', 'ner.result')).alias("cols")
)

# The zipped struct fields are addressed by position: '0', '1', '2'
preds_df = zipped_rows.select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
).toPandas()

# Per-label precision / recall / F1 at token level
report = classification_report(preds_df['ground_truth'], preds_df['prediction'])
print(report)

## Save and Restore
### Pipeline Model

Saving and restoring a trained pipeline is straightforward. Once fitted, a `Pipeline` becomes a `PipelineModel`:

In [None]:
# `ner_model` is the PipelineModel returned by .fit();
# its `stages` attribute lists every fitted stage it contains
ner_model.stages
# saving the PipelineModel therefore saves all of these stages together

In [None]:
# Persist the whole fitted pipeline; overwrite() replaces any earlier save at this path
ner_model.write().overwrite().save("./ner_conll03_word2vec_pipeline")

In [None]:
# Import PipelineModel explicitly instead of relying on a wildcard import to re-export it
from pyspark.ml import PipelineModel

# Load the pipeline back from disk and check that every stage came with it
loadedPipelineModel = PipelineModel.load("./ner_conll03_word2vec_pipeline")
loadedPipelineModel.stages
# the loaded pipeline lists the same stages that were saved

In [None]:
# you can use it with Spark NLP LightPipeline 
lp_loadedPipeline = LightPipeline(loadedPipelineModel)

lp_loadedPipeline.annotate("My name is John and I am a Doctor in London!")

In [None]:
# or you can use it via DataFrame
from pyspark.sql.types import StringType

dfTest = spark.createDataFrame([
    "This movie is a delight for those of all ages. I have seen it several times and each time I am enchanted by the characters and magic. The cast is outstanding, the special effects delightful, everything most believable.",
    "This film was to put it simply rubbish. The child actors couldn't act, as can be seen by Harry's supposed surprise on learning he's a wizard. I'm a wizard! is said with such indifference you'd think he's not surprised at all."
], StringType()).toDF("text")

loadedPipelineModel\
  .transform(dfTest)\
  .select("class.result")\
  .show(2, False)

### Annotator Models
Now let's say you would like to only save the trained annotators inside your pipeline so you can load them inside another custom Pipeline

In [None]:
# all we need is to access that stage and save it on disk
ner_model.stages

In [None]:
# The pipeline was built as [embeddings, tagger], so counting from the end:
print(ner_model.stages[-1])  # fitted NER tagger
print(ner_model.stages[-2])  # fitted embeddings model

In [None]:
# save the trained NER model (last stage); the folder name records that it was
# trained on CoNLL-2003 with the word2vec embeddings from this pipeline
ner_model.stages[-1].write().overwrite().save("./nerdl_conll03_word2vec_model")

In [None]:
# and here is the trained embeddings model (second-to-last stage), saved separately
ner_model.stages[-2].write().overwrite().save("./word2vec_conll03_model")