![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/word2vec/Train_Word2Vec_and_Named_Entity_Recognition.ipynb)

# Train NER Model with Word2Vec Embeddings

In [None]:
# Only run this Cell when you are using Spark NLP on Google Colab
! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash

In [None]:
import sparknlp
# Start Spark with Spark NLP. The resulting `spark` handle is what the later
# cells pass to the CoNLL reader and use to build the sample DataFrame.
spark = sparknlp.start()

In [None]:
# Download the CoNLL-2003 English files from the Spark NLP repository into the
# working directory: eng.train is read for training, eng.testa for evaluation.
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.train
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.testa

In [None]:
from sparknlp.training import CoNLL

# Parse the CoNLL training file into a DataFrame of annotation columns
# (text, document, sentence, token, pos, label).
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, './eng.train')

# Preview the first three rows.
training_data.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|EU rejects German...|[{document, 0, 47...|[{document, 0, 47...|[{token, 0, 1, EU...|[{pos, 0, 1, NNP,...|[{named_entity, 0...|
|     Peter Blackburn|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 4, Pe...|[{pos, 0, 4, NNP,...|[{named_entity, 0...|
| BRUSSELS 1996-08-22|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 7, BR...|[{pos, 0, 7, NNP,...|[{named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [None]:
from pyspark.ml import Pipeline

# NOTE(review): wildcard imports — the annotator, model and assembler classes
# used in later cells (Word2VecApproach, NerDLApproach, DocumentAssembler, ...)
# presumably come from these modules; consider importing them by name.
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import os
# Set TensorFlow's C++ log-level variable to '3' — presumably to quiet native
# TensorFlow logging during training; TODO confirm it takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


In [None]:
# Stage 1: learn Word2Vec embeddings from the "token" column and expose them
# as "embeddings". The chain is wrapped in parentheses instead of using
# backslash continuations, so no trailing "\" can swallow the following line.
word2Vec = (
    Word2VecApproach()
    .setInputCols("token")
    .setOutputCol("embeddings")
    .setMaxSentenceLength(1000)
    .setStepSize(0.025)
    .setMinCount(5)
    .setVectorSize(100)
    .setNumPartitions(1)
    .setMaxIter(1)
    .setSeed(42)
    .setStorageRef("word2vec_conll03")
)

# Stage 2: train the NerDL tagger on sentences, tokens and the embeddings
# produced by stage 1, using the "label" column as ground truth.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setLr(0.003)
    .setBatchSize(8)
    .setRandomSeed(0)
    .setVerbose(1)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    .setValidationSplit(0.2)
    .setOutputLogsPath('ner_logs')  # if not set, logs will be written to ~/annotator_logs
    # Optional settings, disabled by default:
    # .setGraphFolder('graphs')  # put your graph file (pb) under this folder if you are using a custom graph generated through the 4.1 NerDL-Graph.ipynb notebook
    # .setEnableMemoryOptimizer(True)  # if you have limited memory and a large CoNLL file, set this to True to train batch by batch
)

# Embeddings must be computed before the tagger consumes them, hence the order.
ner_pipeline = Pipeline(stages=[
    word2Vec,
    nerTagger
])


In [None]:
# Fit the two-stage pipeline (Word2Vec, then NerDL) on the CoNLL training data.
ner_model = ner_pipeline.fit(training_data)

In [None]:
# List the training log files in the folder configured via setOutputLogsPath('ner_logs').
!cd ./ner_logs && ls -l

total 4
-rw-r--r-- 1 root root 1017 20. Feb 18:09 NerDLApproach_00802da54a15.log


In [None]:
# Print the training log of this run; the file name is built from the uid of the nerTagger instance.
!cat ./ner_logs/{nerTagger.uid}.log

Name of the selected graph: ner-dl/blstm_10_100_128_120.pb
Training started - total epochs: 1 - lr: 0.003 - batch size: 8 - labels: 9 - chars: 84 - training examples: 11239


Epoch 1/1 started, lr: 0.003, dataset size: 11239


Epoch 1/1 - 48.29s - loss: 4617.783 - batches: 1407
Quality on validation dataset (20.0%), validation examples = 2247
time to finish evaluation: 3.36s
label	 tp	 fp	 fn	 prec	 rec	 f1
B-LOC	 1029	 145	 376	 0.87649065	 0.7323843	 0.7979837
I-ORG	 381	 93	 363	 0.8037975	 0.51209676	 0.6256157
I-MISC	 124	 121	 118	 0.50612247	 0.5123967	 0.50924027
I-LOC	 138	 53	 77	 0.7225131	 0.6418605	 0.67980295
I-PER	 898	 237	 21	 0.79118943	 0.97714907	 0.8743915
B-MISC	 528	 192	 190	 0.73333335	 0.73537606	 0.73435324
B-ORG	 771	 139	 507	 0.8472527	 0.6032864	 0.7047532
B-PER	 1200	 630	 111	 0.6557377	 0.9153318	 0.7640879
tp: 5069 fp: 1610 fn: 1763 labels: 8
Macro-average	 prec: 0.74205464, rec: 0.7037352, f1: 0.7223871
Micro-average	 prec: 0.75894594, rec: 0.7419497

In [None]:
from sparknlp.training import CoNLL

# Read the evaluation split with the CoNLL reader; it yields the same
# annotation columns as the training DataFrame.
test_reader = CoNLL()
test_data = test_reader.readDataset(
    spark,
    './eng.testa',
)

# Three-row preview.
test_data.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|CRICKET - LEICEST...|[{document, 0, 64...|[{document, 0, 64...|[{token, 0, 6, CR...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|
|   LONDON 1996-08-30|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 5, LO...|[{pos, 0, 5, NNP,...|[{named_entity, 0...|
|West Indian all-r...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 3, We...|[{pos, 0, 3, NNP,...|[{named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [None]:
# Run the fitted pipeline on the test split; this adds the "embeddings" and "ner" columns.
predictions = ner_model.transform(test_data)

In [None]:
import pyspark.sql.functions as F

from sklearn.metrics import classification_report

# Zip the per-row arrays of tokens, gold labels and predicted labels, then
# explode them so that every token becomes its own row.
zipped_cols = F.explode(
    F.arrays_zip(
        predictions.token.result,
        predictions.label.result,
        predictions.ner.result,
    )
).alias("cols")

# Pull the three zipped fields out into named columns.
token_level = predictions.select(zipped_cols).select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
)

# Collect to pandas so scikit-learn can compute the per-label report.
preds_df = token_level.toPandas()

print(classification_report(preds_df['ground_truth'], preds_df['prediction']))

              precision    recall  f1-score   support

       B-LOC       0.88      0.71      0.79      1837
      B-MISC       0.77      0.74      0.76       922
       B-ORG       0.85      0.58      0.69      1341
       B-PER       0.66      0.93      0.78      1842
       I-LOC       0.70      0.53      0.60       257
      I-MISC       0.62      0.59      0.60       346
       I-ORG       0.83      0.42      0.56       751
       I-PER       0.80      0.96      0.87      1307
           O       0.99      0.99      0.99     42759

    accuracy                           0.95     51362
   macro avg       0.79      0.72      0.74     51362
weighted avg       0.95      0.95      0.95     51362



## Save and Restore


### Annotator Models
Suppose you only want to save the trained annotators from your pipeline, so that you can load them into another custom pipeline later.

In [None]:
# The fitted pipeline keeps its trained stages in a list; to save one of them
# on its own, all we need is to access that stage and write it to disk.
ner_model.stages

[Word2VecModel_2ebfbb8d7c3b, NerDLModel_c57cffac70ba]

In [None]:
# Index from the end of the stage list: [-1] is the trained NerDL model,
# [-2] is the trained Word2Vec model that feeds it.
print(ner_model.stages[-1])
print(ner_model.stages[-2])

NerDLModel_c57cffac70ba
Word2VecModel_2ebfbb8d7c3b


In [None]:
# Save the trained NerDLModel. The folder name also records that it was
# trained on the word2vec_conll03 embeddings, which it needs at prediction time.
ner_model.stages[-1].write().overwrite().save("./nerdl_conll03_word2vec_model")

In [None]:
# Save the trained Word2VecModel as well, so the embeddings stage can be
# reloaded alongside the NER model.
ner_model.stages[-2].write().overwrite().save("./word2vec_conll03_model")

This is how you use your saved models within a new pipeline:

In [None]:
# Pre-processing: raw text -> document -> sentences -> tokens.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)
token = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Reload the saved Word2Vec stage and wire it to the token column.
word2vecModel = Word2VecModel.load("./word2vec_conll03_model")
word2vecModel = word2vecModel.setInputCols("token").setOutputCol("embeddings")

# Reload the saved NerDL stage and wire it to sentences, tokens and embeddings.
nerdlModel = NerDLModel.load("./nerdl_conll03_word2vec_model")
nerdlModel = nerdlModel.setInputCols(["sentence", "token", "embeddings"]).setOutputCol("ner")

# Assemble everything, in dependency order, into a single prediction pipeline.
prediction_stages = [document, sentence, token, word2vecModel, nerdlModel]
ner_prediction_pipeline = Pipeline(stages=prediction_stages)


In [None]:
# Run the prediction pipeline on a small DataFrame of raw text.
from pyspark.sql.types import StringType

sample_texts = [
    "My name is John and I am a Doctor in London!",
    "Peter Parker is a nice persn and lives in New York. Bruce Wayne is also a nice guy and lives in Gotham city.",
]
dfTest = spark.createDataFrame(sample_texts, StringType()).toDF("text")

# fit() turns the Pipeline into a PipelineModel, which then annotates the text;
# only the predicted tag sequences are displayed, untruncated.
fitted_prediction_pipeline = ner_prediction_pipeline.fit(dfTest)
(
    fitted_prediction_pipeline
    .transform(dfTest)
    .select("ner.result")
    .show(2, False)
)

+-------------------------------------------------------------------------------------------------------+
|result                                                                                                 |
+-------------------------------------------------------------------------------------------------------+
|[O, O, O, B-PER, O, O, O, O, B-ORG, O, B-LOC, O]                                                       |
|[B-PER, I-PER, O, O, O, O, O, O, O, B-LOC, I-LOC, O, B-PER, I-PER, O, O, O, O, O, O, O, O, B-LOC, O, O]|
+-------------------------------------------------------------------------------------------------------+



### Save and restore the whole Pipeline

In [None]:

# Persist the whole prediction pipeline (all of its stages) under one folder.
ner_prediction_pipeline.write().overwrite().save("./ner_conll03_word2vec_pipeline")
# let's load it back and try
loadedPipeline = Pipeline.load("./ner_conll03_word2vec_pipeline")
# Use getStages() here: on a Pipeline the `stages` attribute is the Param
# descriptor (it displays as Param(parent=..., name='stages', ...)), not the
# actual list of stages.
loadedPipeline.getStages()
# we have all of our stages inside the loaded pipeline!

Param(parent='Pipeline_704aa7f63c6f', name='stages', doc='a list of pipeline stages')

In [None]:
# Run the reloaded pipeline on the same sample DataFrame and show the
# predicted tag sequences without truncation.
(
    loadedPipeline
    .fit(dfTest)
    .transform(dfTest)
    .select("ner.result")
    .show(2, False)
)

+-------------------------------------------------------------------------------------------------------+
|result                                                                                                 |
+-------------------------------------------------------------------------------------------------------+
|[O, O, O, B-PER, O, O, O, O, B-ORG, O, B-LOC, O]                                                       |
|[B-PER, I-PER, O, O, O, O, O, O, O, B-LOC, I-LOC, O, B-PER, I-PER, O, O, O, O, O, O, O, O, B-LOC, O, O]|
+-------------------------------------------------------------------------------------------------------+

