![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# 3. Spark NLP Pretrained Models

Spark NLP offers the following pre-trained models in five languages (English, French, German, Italian, Russian). All you need to do is download a pre-trained model to your disk by specifying the model name, and then configure the model parameters for your use case and dataset. You then do not need to worry about training a new model from scratch, and can apply the pre-trained SOTA algorithms directly to your own data with transform().

In the official documentation, you can find detailed information about how these models were trained, including the algorithms and datasets that were used.

https://github.com/JohnSnowLabs/spark-nlp-models

In [4]:
import sparknlp


from sparknlp.base import *
from sparknlp.annotator import *

print("Spark NLP version", sparknlp.version())

print("Apache Spark version:", spark.version)

spark

## LemmatizerModel

In [6]:
!wget -O news_category_test.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv

In [7]:
%fs ls "file:/databricks/driver"

path,name,size
file:/databricks/driver/conf/,conf/,4096
file:/databricks/driver/logs/,logs/,4096
file:/databricks/driver/eventlogs/,eventlogs/,4096
file:/databricks/driver/ganglia/,ganglia/,4096
file:/databricks/driver/news_category_test.csv,news_category_test.csv,1504408
file:/databricks/driver/derby.log,derby.log,726


In [8]:
dbutils.fs.cp("file:/databricks/driver/news_category_test.csv", "dbfs:/")

In [9]:
import pyspark.sql.functions as F

# Load the news test set (first line is the header) and rename its
# "description" column to "text", the column the DocumentAssembler reads.
# NOTE(review): the path is relative; the file was copied to "dbfs:/" in the
# previous cell, so this presumably resolves against the DBFS root — confirm
# on your cluster.
news_df = spark.read\
                .option("header", "true")\
                .csv("news_category_test.csv")\
                .withColumnRenamed("description", "text")

# Preview the data, truncating every cell to 50 characters.
news_df.show(truncate=50)

In [10]:
# Load the pretrained English lemmatizer ('lemma_antbnc'); it reads the
# "token" column and writes its output to "lemma".
lemmatizer = LemmatizerModel.pretrained('lemma_antbnc', 'en') \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

# Alternative (kept for reference): build a Lemmatizer from a local
# dictionary file instead of loading the pretrained model.
#
# lemmatizer = Lemmatizer() \
#     .setInputCols(["token"]) \
#     .setOutputCol("lemma") \
#     .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter ="\t", key_delimiter = "->")

In [11]:
!cd ~/cache_pretrained && ls -l


In [12]:
from pyspark.ml import Pipeline

# Entry stage: reads the raw "text" column and writes the "document" column
# that the downstream annotators take as input.
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Reads "document" and writes the "token" column.
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Reads "token" and writes the "stem" column.
stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")

# The stemmer and the lemmatizer both read "token", so their outputs can be
# compared side by side in the result.
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 lemmatizer
 ])

# One-row DataFrame holding an empty string in a "text" column.
# NOTE(review): the pipeline is fitted on this placeholder rather than on
# news_df — presumably because no stage here needs to learn from the data
# (the lemmatizer is loaded pretrained); confirm before adding trainable stages.
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)


In [13]:
result = pipelineModel.transform(news_df)

result.show(5)

In [14]:
result.select('token.result','lemma.result').show(5, truncate=100)

## PerceptronModel (POS - Part of speech tags)

In [16]:
pos = PerceptronModel.pretrained("pos_anc", 'en')\
      .setInputCols("document", "token")\
      .setOutputCol("pos")

In [17]:
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 lemmatizer,
 pos
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)


In [18]:
result = pipelineModel.transform(news_df)

result.show(5)

In [19]:
result.select('token.result','pos.result').show(5, truncate=100)

In [20]:
# applying this pipeline to top 100 rows and then converting to Pandas

result = pipelineModel.transform(news_df.limit(100))

result_df = result.select(F.explode(F.arrays_zip('token.result', 'token.begin', 'token.end', 'stem.result',  'lemma.result', 'pos.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']").alias("stem"),
        F.expr("cols['4']").alias("lemma"),
        F.expr("cols['5']").alias("pos")).toPandas()

result_df.head(10) 

In [21]:
# same in LightPipeline

light_model = LightPipeline(pipelineModel)

light_result = light_model.annotate('Unions representing workers at Turner Newall say they are disappointed after talks with stricken parent firm Federal Mogul.')

list(zip(light_result['token'], light_result['stem'], light_result['lemma'], light_result['pos']))

In [22]:
# Apply a POS-tag based chunker to extract spans that match custom tag patterns.
# Patterns are written over POS tags:
#   "<NNP>+"          -> one or more consecutive proper nouns
#   "<DT>?<JJ>*<NN>"  -> optional determiner, any number of adjectives, then a common noun

chunker = Chunker()\
    .setInputCols(["document", "pos"])\
    .setOutputCol("chunk")\
    .setRegexParsers(["<NNP>+", "<DT>?<JJ>*<NN>"])

# NNP: Proper noun
# NN: Common noun
# DT: Determiner (e.g. "the")
# JJ: Adjective

# Same stages as the POS pipeline, with the chunker appended after "pos",
# whose output it consumes.
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 lemmatizer,
 pos,
 chunker
 ])

# Fit on a one-row placeholder DataFrame with an empty "text" value.
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [23]:
result = pipelineModel.transform(news_df.limit(100))

result.show(5)

In [24]:

result_df = result.select(F.explode(F.arrays_zip('chunk.result', 'chunk.begin',  'chunk.end')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end")).toPandas()

result_df.head(10)

## Dependency Parser

In [26]:
dep_parser = DependencyParserModel.pretrained('dependency_conllu')\
        .setInputCols(["document", "pos", "token"])\
        .setOutputCol("dependency")

In [27]:
typed_dep_parser = TypedDependencyParserModel.pretrained('dependency_typed_conllu')\
        .setInputCols(["token", "pos", "dependency"])\
        .setOutputCol("dependency_type")

In [28]:
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 lemmatizer,
 pos,
 dep_parser,
 typed_dep_parser
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [29]:
# Run the dependency-parsing pipeline on the first 100 rows only.
result = pipelineModel.transform(news_df.limit(100))

# Zip the parallel annotation arrays and explode them so that every token
# becomes one row, then pull the zipped fields out by position.
# The first field is token.result, so it is labelled "token" (it was
# mislabelled "chunk", copied from the chunker example).
result_df = result.select(F.explode(F.arrays_zip('token.result', 'token.begin',  'token.end', 'dependency.result', 'dependency_type.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']").alias("dependency"),
        F.expr("cols['4']").alias("dependency_type")).toPandas()

result_df.head(10)

## SpellChecker

In [31]:
spell_checker = NorvigSweetingModel.pretrained('spellcheck_norvig')\
        .setInputCols("token")\
        .setOutputCol("corrected")


In [32]:
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 lemmatizer,
 pos,
 spell_checker
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [33]:
result = pipelineModel.transform(news_df.limit(100))


In [34]:

result_df = result.select(F.explode(F.arrays_zip('token.result', 'corrected.result', 'stem.result',  'lemma.result', 'pos.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("corrected"),
        F.expr("cols['2']").alias("stem"),
        F.expr("cols['3']").alias("lemma"),
        F.expr("cols['4']").alias("pos")).toPandas()

result_df.head(10)

In [35]:
# same in LightPipeline

light_model = LightPipeline(pipelineModel)

light_result = light_model.annotate('The patint has pain and headace')

list(zip(light_result['token'], light_result['corrected']))


## Word Embeddings (Glove)

In [37]:
glove_embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")
    

In [38]:
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 glove_embeddings
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [39]:
result = pipelineModel.transform(news_df.limit(10))


In [40]:
result.select('embeddings.embeddings').take(1)

In [41]:
result = pipelineModel.transform(news_df.limit(10))

result_df = result.select(F.explode(F.arrays_zip('token.result', 'embeddings.embeddings')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("embeddings"))

result_df.show(10, truncate=100)

## Elmo Embeddings

In [43]:
elmo_embeddings = ElmoEmbeddings.pretrained('elmo')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")
    

In [44]:
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 elmo_embeddings
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(news_df.limit(10))

result_df = result.select(F.explode(F.arrays_zip('token.result', 'embeddings.embeddings')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("elmo_embeddings"))

result_df.show(truncate=100)

## Bert Embeddings

In [46]:
bert_embeddings = BertEmbeddings.pretrained('bert_base_uncased')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")
    

In [47]:
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 bert_embeddings
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(news_df.limit(10))

result_df = result.select(F.explode(F.arrays_zip('token.result', 'embeddings.embeddings')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("bert_embeddings"))

result_df.show(truncate=100)

## UniversalSentenceEncoder

In [49]:
# no need for token columns 
use_embeddings = UniversalSentenceEncoder.pretrained('tfhub_use').\
  setInputCols(["document"]).\
  setOutputCol("sentence_embeddings")

    

In [50]:
nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 use_embeddings
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(news_df.limit(10))

result_df = result.select(F.explode(F.arrays_zip('document.result', 'sentence_embeddings.embeddings')).alias("cols")) \
.select(F.expr("cols['0']").alias("document"),
        F.expr("cols['1']").alias("USE_embeddings"))

result_df.show(truncate=100)

### Loading Models from local

In [52]:
glove_embeddings = WordEmbeddingsModel.load('/root/cache_pretrained/glove_100d_en_2.4.0_2.4_1579690104032').\
  setInputCols(["document", 'token']).\
  setOutputCol("glove_embeddings")

    

### Using your own Word embeddings in Spark NLP

In [54]:
custom_embeddings = WordEmbeddings()\
  .setInputCols(["sentence", "token"])\
  .setOutputCol("my_embeddings")\
  .setStoragePath('PubMed-shuffle-win-2.bin', "BINARY")\
  .setDimension(200)

## Getting Sentence Embeddings from Glove, Elmo and BERT

In [56]:
glove_embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE") # or SUM


nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 glove_embeddings,
 embeddingsSentence
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(news_df.limit(10))

result_df = result.select(F.explode(F.arrays_zip('document.result', 'sentence_embeddings.embeddings')).alias("cols")) \
.select(F.expr("cols['0']").alias("document"),
        F.expr("cols['1']").alias("sentence_embeddings"))

result_df.show(truncate=100)



### Cosine similarity between two embeddings (sentence similarity)

In [58]:
from scipy.spatial import distance

import numpy as np

# Fetch the first two sentence embeddings with a single Spark action
# (calling take(2) once per vector would run the job twice).
first_two_rows = result_df.select('sentence_embeddings').take(2)

# Each Row holds one column, so [row_index][0] is the embedding vector.
v1 = first_two_rows[0][0]

# Second row: the original `take(2)[][0]` had an empty subscript, which is a
# syntax error; index 1 is the only other row that take(2) returns.
v2 = first_two_rows[1][0]

# Cosine similarity = 1 - cosine distance.
1 - distance.cosine(np.array(v1), np.array(v2))

In [59]:
# v2 is re-read from row 0, the same row index v1 was taken from, so this
# compares an embedding with itself (the similarity is expected to be ~1.0).
v2 = result_df.select('sentence_embeddings').take(2)[0][0]

1 - distance.cosine(np.array(v1), np.array(v2))

## NERDL Model

### Public NER (CoNLL 2003)

Entities

``` PERSON, LOCATION, ORGANIZATION, MISC ```

In [63]:
public_ner = NerDLModel.pretrained("ner_dl", 'en') \
          .setInputCols(["document", "token", "embeddings"]) \
          .setOutputCol("ner")

In [64]:
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# ner_dl model is trained with glove_100d. So we use the same embeddings in the pipeline
glove_embeddings = WordEmbeddingsModel.pretrained('glove_100d').\
  setInputCols(["document", 'token']).\
  setOutputCol("embeddings")

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 glove_embeddings,
 public_ner
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)


In [65]:
result = pipelineModel.transform(news_df.limit(10))

result_df = result.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

result_df.show(50, truncate=100)



### NerDL OntoNotes 100D

Entities

``` 'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART' ```

In [68]:
onto_ner = NerDLModel.pretrained("onto_100", 'en') \
          .setInputCols(["document", "token", "embeddings"]) \
          .setOutputCol("ner")

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 glove_embeddings,
 onto_ner
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)


In [69]:
result = pipelineModel.transform(news_df.limit(10))

result_df = result.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

result_df.show(50, truncate=100)


### Getting the NER chunks with NER Converter

In [71]:

ner_converter = NerConverter() \
  .setInputCols(["document", "token", "ner"]) \
  .setOutputCol("ner_chunk")


nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 glove_embeddings,
 onto_ner,
 ner_converter
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(news_df.limit(10))



In [72]:
result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

In [73]:
# fullAnnotate in LightPipeline

light_model = LightPipeline(pipelineModel)

light_result = light_model.fullAnnotate('Unions representing workers at Turner Newall say they are disappointed after talks with stricken parent firm Federal Mogul and Mike Fitzpatrick in Canada.')


import pandas as pd

# fullAnnotate returns one result per input text; take the first one and
# collect the text and the entity label of each of its NER chunks.
ner_annotations = list(light_result[0]['ner_chunk'])

chunks = [annotation.result for annotation in ner_annotations]
entities = [annotation.metadata['entity'] for annotation in ner_annotations]

df = pd.DataFrame({'chunks': chunks, 'entities': entities})

df

## Highlight the entities

In [75]:
import random
from IPython.core.display import display, HTML


def get_color():
    """Return a random color as an uppercase ``#RRGGBB`` hex string.

    Each of the three channels is drawn independently with
    ``random.randint(100, 255)`` (both ends inclusive), so no channel is
    ever below 100.
    """
    red, green, blue = (random.randint(100, 255) for _ in range(3))
    return '#%02X%02X%02X' % (red, green, blue)


In [76]:
from spacy import displacy

def show_html_spacy(annotated_text, filter_labels=True):
    """Render the NER chunks of one annotated document with spaCy's displaCy.

    Args:
        annotated_text: mapping with a 'ner_chunk' sequence (items exposing
            ``begin``, ``end`` and ``metadata['entity']``) and a 'document'
            sequence whose first item exposes the text as ``result``.
        filter_labels: accepted for compatibility; not used by this function.

    Returns:
        Whatever ``displacy.render`` returns. NOTE(review): with
        ``jupyter=True`` displaCy presumably displays the markup itself, so
        the returned value may be ``None`` — confirm.
    """
    spans = []
    seen_labels = []

    for chunk in annotated_text['ner_chunk']:
        # The span end is passed as end + 1 — presumably because annotation
        # end offsets are inclusive while displaCy expects an exclusive end.
        span = {'start': chunk.begin,
                'end': chunk.end + 1,
                'label': chunk.metadata['entity'].upper()}
        seen_labels.append(chunk.metadata['entity'].upper())
        spans.append(span)

    # displaCy "manual" mode input: a list of {'text', 'ents', 'title'} dicts.
    payload = [{'text': annotated_text['document'][0].result,
                'ents': spans,
                'title': None}]

    # One random color per distinct entity label.
    unique_labels = list(set(seen_labels))
    palette = {label: get_color() for label in unique_labels}

    return displacy.render(payload, style='ent', jupyter=True, manual=True,
                           options={"ents": unique_labels, 'colors': palette})

In [77]:
ann_text = light_model.fullAnnotate('Unions representing workers at Turner Newall say they are disappointed after talks with stricken parent firm Federal Mogul and Mike Fitzpatrick in Canada.')

show_html_spacy (ann_text[0])

In [78]:
def get_NER_html(annotated_text):
    """Display the annotated tokens as HTML, highlighting each entity type.

    Every token whose NER tag is not 'O' is wrapped in a colored ``<SPAN>``;
    a "Color codes" legend listing one entry per entity label follows the
    text. Colors come from ``get_color()`` and are therefore random per call.

    Args:
        annotated_text: mapping with parallel 'token' and 'ner' sequences of
            strings. Tags are expected to be 'O' or '<prefix>-<LABEL>'
            (e.g. 'B-PER'); a tag without a dash is used as the label as-is.

    Returns:
        The value returned by ``display(HTML(...))``.
    """
    # Local import: tokens come from arbitrary text and must be escaped
    # before being embedded in markup.
    from html import escape

    tokens = annotated_text['token']
    tags = annotated_text['ner']

    def entity_of(tag):
        # Drop the chunk prefix ('B-', 'I-', ...). Split only once so labels
        # that themselves contain a dash are kept whole.
        return tag.split('-', 1)[1] if '-' in tag else tag

    # Sorted so the legend order is stable between calls.
    label_list = sorted({entity_of(tag) for tag in tags if tag != 'O'})
    label_color = {label: get_color() for label in label_list}

    # Open the wrapper <div> that the original closed without ever opening.
    parts = ['<div>']

    for index, this_token in enumerate(tokens):
        # A token without a matching tag is rendered unhighlighted.
        tag = tags[index] if index < len(tags) else 'O'
        ent = entity_of(tag)
        safe_token = escape(this_token)

        if ent in label_color:
            parts.append(
                '<SPAN style="background-color: {}">'.format(label_color[ent])
                + safe_token + " </SPAN>"
            )
        else:
            parts.append(safe_token + " ")

    parts.append('</div>')

    legend = ', '.join(
        '<SPAN style="background-color: {}">{}</SPAN>'.format(
            label_color[label], escape(label))
        for label in label_list
    )
    # Close the legend <div> as well so the emitted markup is balanced.
    parts.append('<div>Color codes: ' + legend + '</div>')

    return display(HTML(''.join(parts)))
    



In [79]:
ann_text = light_model.annotate('Unions representing workers at Turner Newall say they are disappointed after talks with stricken parent firm Federal Mogul and Mike Fitzpatrick in Canada.')

get_NER_html (ann_text)