![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# 2. Pretrained pipelines for Grammar, NER and Sentiment

In [0]:
import sparknlp

# Managed runtimes (e.g. Databricks) inject a ready-made `spark` session into the
# notebook namespace. On a plain Jupyter kernel that name does not exist, so fall
# back to Spark NLP's own session bootstrap to keep Restart & Run All working.
if "spark" not in globals():
    spark = sparknlp.start()

print("Spark NLP version", sparknlp.version())

print("Apache Spark version:", spark.version)

# Bare last expression: the session object itself is the cell output.
spark

## Using Pretrained Pipelines

https://github.com/JohnSnowLabs/spark-nlp-models

https://nlp.johnsnowlabs.com/models

In [0]:
from sparknlp.pretrained import PretrainedPipeline

In [0]:
# Sample text fed to the pipelines below. The misspellings ("persn", "intersting",
# "brthers") are kept on purpose: the spell-checking stage corrects them
# (e.g. "persn" -> "person" in the table further down).
testDoc = '''Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Nogal Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
'''

In [0]:
# Echo the raw string as the cell output.
testDoc

### Explain Document ML

**Stages**
- DocumentAssembler
- SentenceDetector
- Tokenizer
- Lemmatizer
- Stemmer
- Part of Speech
- SpellChecker (Norvig)

In [0]:
# Load the pretrained English 'explain_document_ml' pipeline (stages listed above).
pipeline = PretrainedPipeline('explain_document_ml', lang='en')


In [0]:
# Inspect the stages of the model wrapped by the pretrained pipeline.
pipeline.model.stages

In [0]:
# annotate() on a single string; the result is indexed by key in the cells below
# ('sentence', 'token', 'pos', 'lemmas', 'stems', 'spell').
result = pipeline.annotate(testDoc)

In [0]:
# The names listed here are the keys used to index `result` in the next cells.
result.keys()

In [0]:
# Output stored under the 'sentence' key.
result['sentence']

In [0]:
# Output stored under the 'token' key.
result['token']

In [0]:
# Pair every token with its part-of-speech tag.
[(tok, tag) for tok, tag in zip(result['token'], result['pos'])]

In [0]:
# Line up, per token, the original form with its lemma, stem and spell-checked form.
list(zip(*(result[key] for key in ('token', 'lemmas', 'stems', 'spell'))))

In [0]:
import pandas as pd

# Token-level table for the explain_document_ml result: one row per token,
# one column per annotation of interest.
df = pd.DataFrame({
    'token': result['token'],
    'corrected': result['spell'],
    'POS': result['pos'],
    'lemmas': result['lemmas'],
    'stems': result['stems'],
})
df

Unnamed: 0,token,corrected,POS,lemmas,stems
0,Peter,Peter,NNP,Peter,peter
1,is,is,VBZ,be,i
2,a,a,DT,a,a
3,very,very,RB,very,veri
4,good,good,JJ,good,good
5,persn,person,NN,person,person
6,.,.,.,.,.
7,My,My,PRP$,My,my
8,life,life,NN,life,life
9,in,in,IN,in,in


### Explain Document DL

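**Stages** (not listed in execution order — the cells below assume the NER model is the second-to-last stage, `stages[-2]`; check the output of `pipeline_dl.model.stages`)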
- DocumentAssembler
- SentenceDetector
- Tokenizer
- NER (NER with GloVe 100D embeddings, CoNLL2003 dataset)
- Lemmatizer
- Stemmer
- Part of Speech
- SpellChecker (Norvig)

In [0]:
# Load the pretrained English 'explain_document_dl' pipeline (adds NER to the ML stages).
pipeline_dl = PretrainedPipeline('explain_document_dl', lang='en')


In [0]:
# Inspect the stages; the next two cells pick one of them by position.
pipeline_dl.model.stages

In [0]:
# NOTE(review): stages[-2] is assumed to be the NER model — confirm against the
# stage list printed above. getStorageRef() presumably names the embeddings the
# model is tied to; verify in the Spark NLP docs.
pipeline_dl.model.stages[-2].getStorageRef()

In [0]:
# NOTE(review): again assumes stages[-2] is the NER model; getClasses() presumably
# lists the labels it can emit (the result table below shows e.g. B-PER and O).
pipeline_dl.model.stages[-2].getClasses()

In [0]:
# Re-binds `result`: from here on it holds the explain_document_dl output. The cells
# below read it with different key names than the ML pipeline's result
# ('lemma' / 'stem' / 'checked' rather than 'lemmas' / 'stems' / 'spell').
result = pipeline_dl.annotate(testDoc)

result.keys()

In [0]:
# Output stored under the 'entities' key.
result['entities']

In [0]:
# Token-level table for the explain_document_dl result, now including the NER label.
df = pd.DataFrame({
    'token': result['token'],
    'ner_label': result['ner'],
    'spell_corrected': result['checked'],
    'POS': result['pos'],
    'lemmas': result['lemma'],
    'stems': result['stem'],
})

df

Unnamed: 0,token,ner_label,spell_corrected,POS,lemmas,stems
0,Peter,B-PER,Peter,NNP,Peter,peter
1,is,O,is,VBZ,be,i
2,a,O,a,DT,a,a
3,very,O,very,RB,very,veri
4,good,O,good,JJ,good,good
5,persn,O,person,NN,person,person
6,.,O,.,.,.,.
7,My,O,My,PRP$,My,my
8,life,O,life,NN,life,life
9,in,O,in,IN,in,in


### Recognize Entities DL

In [0]:
# Load the pretrained English 'recognize_entities_dl' pipeline.
recognize_entities = PretrainedPipeline('recognize_entities_dl', lang='en')


In [0]:
# Same sentences as the earlier testDoc, re-bound here with a leading newline.
testDoc = '''
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Nogal Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
'''

result = recognize_entities.annotate(testDoc)

# Pair every token with the NER tag assigned to it.
[(tok, tag) for tok, tag in zip(result['token'], result['ner'])]

### Clean Stop Words

In [0]:
# Load the pretrained English 'clean_stop' pipeline.
clean_stop = PretrainedPipeline('clean_stop', lang='en')


In [0]:
# Run the stop-word pipeline on testDoc; the next cell reads the 'cleanTokens' key.
result = clean_stop.annotate(testDoc)
result.keys()

In [0]:
# Join the 'cleanTokens' output back into a single space-separated string.
' '.join(result['cleanTokens'])

### Spell Checker 

(Norvig algorithm)

Reference: https://norvig.com/spell-correct.html

In [0]:
# Load the pretrained English 'check_spelling' pipeline.
spell_checker = PretrainedPipeline('check_spelling', lang='en')


In [0]:
# Same text as the previous testDoc definition; the misspellings ("persn",
# "intersting", "brthers") are what this pipeline is expected to correct.
testDoc = '''
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Nogal Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
'''

result = spell_checker.annotate(testDoc)

# The next cell reads the 'token' and 'checked' keys.
result.keys()

In [0]:
# Show each original token next to its spell-checked form.
[(original, corrected) for original, corrected in zip(result['token'], result['checked'])]

### Parsing a list of texts

In [0]:
# Three short texts to annotate in a single call; a few words are misspelled
# (e.g. "pioner", "wrate", "aircrast").
testDoc_list = [
    'French author who helped pioner the science-fiction genre.',
    'Verne wrate about space, air, and underwater travel before navigable aircrast',
    'Practical submarines were invented, and before any means of space travel had been devised.',
]

testDoc_list

In [0]:
# NOTE(review): this reloads the same 'explain_document_ml' pipeline that was already
# bound to `pipeline` earlier in the notebook.
pipeline = PretrainedPipeline('explain_document_ml', lang='en')


In [0]:
# annotate() also accepts a list of strings; the result is indexable per input text.
result_list = pipeline.annotate(testDoc_list)

len(result_list)

In [0]:
# Annotations for the first text in testDoc_list.
result_list[0]

### Using fullAnnotate to get more details

```
annotatorType: String, 
begin: Int, 
end: Int, 
result: String, (this is what annotate returns)
metadata: Map[String, String], 
embeddings: Array[Float]
```

In [0]:
# Single short sentence used for the fullAnnotate examples below.
text = 'Peter Parker is a nice guy and lives in New York'

In [0]:
# `pipeline_dl` is the explain_document_dl pipeline loaded earlier.
# fullAnnotate() returns a list (indexed with [0] below) whose entries map each
# output name to annotation objects exposing .result, .begin, .end and .metadata.

detailed_result = pipeline_dl.fullAnnotate(text)

In [0]:
# Full annotation objects for every output of the pipeline.
detailed_result

In [0]:
# Entity chunk annotations for the (only) input text.
detailed_result[0]['entities']

In [0]:
# .result holds the chunk text of the first entity.
detailed_result[0]['entities'][0].result

In [0]:
# Collect the text and the entity label of every recognised chunk, then tabulate them.
entity_chunks = detailed_result[0]['entities']
chunks = [chunk.result for chunk in entity_chunks]
entities = [chunk.metadata['entity'] for chunk in entity_chunks]

df = pd.DataFrame({'chunks': chunks, 'entities': entities})
df

Unnamed: 0,chunks,entities
0,Peter Parker,PER
1,New York,LOC


In [0]:
# One row per token: sentence index, surface form, begin/end offsets, POS tag, NER tag.
first_doc = detailed_result[0]
tuples = [
    (int(tok.metadata['sentence']), tok.result, tok.begin, tok.end, pos.result, ner.result)
    for tok, pos, ner in zip(first_doc["token"], first_doc["pos"], first_doc["ner"])
]

df = pd.DataFrame(tuples, columns=['sent_id','token','start','end','pos', 'ner'])

df


Unnamed: 0,sent_id,token,start,end,pos,ner
0,0,Peter,0,4,NNP,B-PER
1,0,Parker,6,11,NNP,I-PER
2,0,is,13,14,VBZ,O
3,0,a,16,16,DT,O
4,0,nice,18,21,JJ,O
5,0,guy,23,25,NN,O
6,0,and,27,29,CC,O
7,0,lives,31,35,NNS,O
8,0,in,37,38,IN,O
9,0,New,40,42,NNP,B-LOC


### Sentiment Analysis

#### Vivekn sentiment algorithm

Paper: `Fast and accurate sentiment classification using an enhanced Naive Bayes model`

https://arxiv.org/abs/1305.6143

Code: https://github.com/vivekn/sentiment

In [0]:
# Load the pretrained English 'analyze_sentiment' pipeline.
sentiment = PretrainedPipeline('analyze_sentiment', lang='en')

In [0]:
# Score a single negated sentence and show the value stored under 'sentiment'.
result = sentiment.annotate("The movie I watched today was not a good one")

result['sentiment']

#### DL version (trained on imdb)

In [0]:
# NOTE(review): `sentiment_imdb` is not referenced by any later cell in this notebook;
# only the GloVe variant loaded in the next cell is actually used.
sentiment_imdb = PretrainedPipeline('analyze_sentimentdl_use_imdb', lang='en')

In [0]:
# Load the pretrained English 'analyze_sentimentdl_glove_imdb' pipeline (used below).
sentiment_imdb_glove = PretrainedPipeline('analyze_sentimentdl_glove_imdb', lang='en')

In [0]:
# A multi-sentence film review used as input for the IMDB-trained sentiment pipeline.
comment = '''
It's a very scary film but what impressed me was how true the film sticks to the original's tricks; it isn't filled with loud in-your-face jump scares, in fact, a lot of what makes this film scary is the slick cinematography and intricate shadow play. The use of lighting and creation of atmosphere is what makes this film so tense, which is why it's perfectly suited for those who like Horror movies but without the obnoxious gore.
'''
result = sentiment_imdb_glove.annotate(comment)

result['sentiment']

In [0]:
# Same input via fullAnnotate(): shows the annotation objects (with metadata), not just the result strings.
sentiment_imdb_glove.fullAnnotate(comment)[0]['sentiment']

#### DL version (trained on twitter dataset)

In [0]:
# Load the pretrained English 'analyze_sentimentdl_use_twitter' pipeline.
sentiment_twitter = PretrainedPipeline('analyze_sentimentdl_use_twitter', lang='en')

In [0]:
# Score a single sentence with the Twitter-trained pipeline and show the 'sentiment' value.
result = sentiment_twitter.annotate("The movie I watched today was a good one.")

result['sentiment']

End of Notebook