![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/models_hub/Train_a_Spark_NLP_Model.ipynb)

In [1]:
%%capture
 
# Setup Spark NLP on Colab: download the workshop's colab_setup.sh and pipe it straight into bash.
# %%capture must stay on the first line of the cell; it silences the output of the setup script.
# NOTE(review): the script is fetched from the `master` branch, so whatever it installs is not pinned by this notebook.
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/colab_setup.sh -O - | bash

In [2]:
# Imports for the whole notebook, grouped in one place.
import pyspark.sql.functions as F

from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline

# Imported once, after the wildcard imports, so the name `sparknlp` is
# guaranteed to refer to the package itself.
import sparknlp

# Start the Spark session with Spark NLP available.
#   GPU training : sparknlp.start(gpu=True)
#   Spark 2.3    : sparknlp.start(spark23=True)
spark = sparknlp.start()

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 3.1.0
Apache Spark version: 3.0.2


In [3]:
#download training data
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.train
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.testa

In [4]:
from sparknlp.training import CoNLL

# A single reader instance loads both CoNLL files into Spark DataFrames.
conll_reader = CoNLL()

training_data = conll_reader.readDataset(spark, './eng.train')
testing_data = conll_reader.readDataset(spark, './eng.testa')

In [5]:
import pyspark.sql.functions as F

# Zip the parallel token / POS / label arrays and explode them: one row per token.
token_rows = training_data.select(
    F.explode(
        F.arrays_zip('token.result', 'pos.result', 'label.result')
    ).alias("cols")
)

# The zipped struct's fields are addressed by position: '0', '1', '2'.
token_table = token_rows.select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("pos"),
    F.expr("cols['2']").alias("ner_label"),
)

token_table.show(truncate=50)

+----------+---+---------+
|     token|pos|ner_label|
+----------+---+---------+
|        EU|NNP|    B-ORG|
|   rejects|VBZ|        O|
|    German| JJ|   B-MISC|
|      call| NN|        O|
|        to| TO|        O|
|   boycott| VB|        O|
|   British| JJ|   B-MISC|
|      lamb| NN|        O|
|         .|  .|        O|
|     Peter|NNP|    B-PER|
| Blackburn|NNP|    I-PER|
|  BRUSSELS|NNP|    B-LOC|
|1996-08-22| CD|        O|
|       The| DT|        O|
|  European|NNP|    B-ORG|
|Commission|NNP|    I-ORG|
|      said|VBD|        O|
|        on| IN|        O|
|  Thursday|NNP|        O|
|        it|PRP|        O|
+----------+---+---------+
only showing top 20 rows



## 1. Create the Spark NLP training pipeline

In [6]:
!mkdir ner_logs

mkdir: cannot create directory ‘ner_logs’: File exists


In [7]:
# Any word embeddings can be plugged in here (Glove, Elmo, Bert, custom etc.).
# NOTE(review): the embeddings and the converter read "document" while the
# tagger reads "sentence" — confirm this mix of input columns is intended.
embeddings = (
    WordEmbeddingsModel.pretrained('glove_100d')
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Trainable NER tagger; it learns from the "label" column and writes to "ner".
# Optional settings that are not enabled here:
#   .setGraphFolder('graphs')    >> put your graph file (pb) under this folder if you are
#                                   using a custom graph generated thru 4.1 NerDL-Graph.ipynb notebook
#   .setEnableMemoryOptimizer()  >> if you have a limited memory and a large conll file,
#                                   you can set this True to train batch by batch
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setLr(0.003)
    .setBatchSize(32)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    .setOutputLogsPath('ner_logs')  # if not set, logs will be written to ~/annotator_logs
)

# Converts the token-level "ner" output into the "ner_chunk" column.
ner_converter = (
    NerConverter()
    .setInputCols(['document', 'token', 'ner'])
    .setOutputCol('ner_chunk')
)

# Stage order: embeddings -> tagger -> converter.
pipeline_stages = [embeddings, nerTagger, ner_converter]
ner_pipeline = Pipeline(stages=pipeline_stages)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


## 2. Train model

In [8]:
%%time
ner_model = ner_pipeline.fit(training_data)
ner_model.stages[-1].write().overwrite().save('outputs/ner_wiki_glove100d_en')

CPU times: user 1.01 s, sys: 131 ms, total: 1.14 s
Wall time: 3min 8s


In [9]:
import pyspark.sql.functions as F

# Apply the fitted pipeline to the test data.
predictions = ner_model.transform(testing_data)

# One row per token, with the gold label next to the predicted label.
zipped_tokens = F.explode(
    F.arrays_zip('token.result', 'label.result', 'ner.result')
).alias("cols")

comparison = predictions.select(zipped_tokens).select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
)

comparison.show(truncate=False)

+--------------+------------+----------+
|token         |ground_truth|prediction|
+--------------+------------+----------+
|CRICKET       |O           |O         |
|-             |O           |O         |
|LEICESTERSHIRE|B-ORG       |B-ORG     |
|TAKE          |O           |O         |
|OVER          |O           |O         |
|AT            |O           |O         |
|TOP           |O           |O         |
|AFTER         |O           |O         |
|INNINGS       |O           |O         |
|VICTORY       |O           |O         |
|.             |O           |O         |
|LONDON        |B-LOC       |B-LOC     |
|1996-08-30    |O           |O         |
|West          |B-MISC      |B-MISC    |
|Indian        |I-MISC      |B-MISC    |
|all-rounder   |O           |O         |
|Phil          |B-PER       |B-PER     |
|Simmons       |I-PER       |I-PER     |
|took          |O           |O         |
|four          |O           |O         |
+--------------+------------+----------+
only showing top 20 rows

## 3. Benchmark

In [10]:
from sklearn.metrics import classification_report

# Flatten the predictions to one row per token and collect them into pandas.
exploded = predictions.select(
    F.explode(
        F.arrays_zip('token.result', 'label.result', 'ner.result')
    ).alias("cols")
)
preds_df = exploded.select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
).toPandas()

# Per-tag precision / recall / F1, computed token by token.
report = classification_report(preds_df['ground_truth'], preds_df['prediction'])
print(report)

              precision    recall  f1-score   support

       B-LOC       0.93      0.91      0.92      1837
      B-MISC       0.79      0.87      0.83       922
       B-ORG       0.88      0.81      0.84      1341
       B-PER       0.93      0.96      0.95      1842
       I-LOC       0.93      0.68      0.78       257
      I-MISC       0.78      0.61      0.68       346
       I-ORG       0.86      0.70      0.77       751
       I-PER       0.96      0.97      0.96      1307
           O       0.99      1.00      0.99     42759

    accuracy                           0.98     51362
   macro avg       0.89      0.83      0.86     51362
weighted avg       0.98      0.98      0.98     51362



## 4. Save the model and zip it

In [12]:
import os
import shutil

# Resolve the model folder from the same relative path used when saving, rather
# than hard-coding Colab's /content prefix. On Colab (working directory /content)
# this is the same location; elsewhere it follows the current working directory.
model_dir = os.path.abspath("outputs/ner_wiki_glove100d_en")

# Creates <model_dir>.zip containing the contents of the saved model folder;
# the archive path is the cell's displayed result.
shutil.make_archive(model_dir, 'zip', root_dir=model_dir)

'/content/outputs/ner_wiki_glove100d_en.zip'