![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Use pretrained `explain_document_ml` Pipeline

### Stages

 * DocumentAssembler
 * SentenceDetector
 * Tokenizer
 * Lemmatizer
 * Stemmer
 * Part of Speech
 * SpellChecker (Norvig)

In [2]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

### Let's create a Spark Session for our app

In [3]:
# Create the Spark session for this app through Spark NLP's start() helper.
spark = sparknlp.start()

#### This is our test document; we'll use it to illustrate each of the pipeline stages.

In [4]:
# One-element list holding the test document as a single string.
# The misspellings ("pioner", "wrate", "aircrast") are part of the input:
# the notebook later compares them against the spell-checker output.
testDoc = [
    (
        "French author who helped pioner the science-fiction genre. "
        "Verne wrate about space, air, and underwater travel before "
        "navigable aircrast and practical submarines were invented, "
        "and before any means of space travel had been devised. "
    )
]

In [5]:
# Load the pretrained 'explain_document_ml' pipeline for English (lang='en').
# Its outputs are read further down under the keys
# 'sentence', 'token', 'spell', 'lemmas', 'stems' and 'pos'.
pipeline = PretrainedPipeline('explain_document_ml', lang='en')

#### We are not interested in handling big datasets here, so let's call `annotate()` directly on our list of strings, using the pipeline as a LightPipeline for speed.

In [6]:
# Annotate the list of raw strings directly (no DataFrame involved).
# `result` is iterated below as one entry per input document, each entry
# being indexable by output name (e.g. 'sentence', 'token').
result = pipeline.annotate(testDoc)

#### Let's analyze these results - first let's see what sentences we detected

In [7]:
# Sentences detected in each annotated document
[doc['sentence'] for doc in result]

[['French author who helped pioner the science-fiction genre.',
  'Verne wrate about space, air, and underwater travel before navigable aircrast and practical submarines were invented, and before any means of space travel had been devised.']]

#### Now let's see how those sentences were tokenized

In [8]:
# Tokens produced for each annotated document
[doc['token'] for doc in result]

[['French',
  'author',
  'who',
  'helped',
  'pioner',
  'the',
  'science-fiction',
  'genre',
  '.',
  'Verne',
  'wrate',
  'about',
  'space',
  ',',
  'air',
  ',',
  'and',
  'underwater',
  'travel',
  'before',
  'navigable',
  'aircrast',
  'and',
  'practical',
  'submarines',
  'were',
  'invented',
  ',',
  'and',
  'before',
  'any',
  'means',
  'of',
  'space',
  'travel',
  'had',
  'been',
  'devised',
  '.']]

#### Notice some spelling errors? The pipeline takes care of those as well.

In [9]:
# Spell-checked form of every token, per annotated document
[doc['spell'] for doc in result]

[['French',
  'author',
  'who',
  'helped',
  'pioneer',
  'the',
  'science-fiction',
  'genre',
  '.',
  'Verne',
  'wrote',
  'about',
  'space',
  ',',
  'air',
  ',',
  'and',
  'underwater',
  'travel',
  'before',
  'navigable',
  'aircraft',
  'and',
  'practical',
  'submarines',
  'were',
  'invented',
  ',',
  'and',
  'before',
  'any',
  'means',
  'of',
  'space',
  'travel',
  'had',
  'been',
  'devised',
  '.']]

#### Now let's see the lemmas

In [16]:
# Lemma of every token, per annotated document
[doc['lemmas'] for doc in result]

[['French',
  'author',
  'who',
  'help',
  'pioneer',
  'the',
  'science-fiction',
  'genre',
  '.',
  'Verne',
  'write',
  'about',
  'space',
  ',',
  'air',
  ',',
  'and',
  'underwater',
  'travel',
  'before',
  'navigable',
  'aircraft',
  'and',
  'practical',
  'submarine',
  'be',
  'invent',
  ',',
  'and',
  'before',
  'any',
  'mean',
  'of',
  'space',
  'travel',
  'have',
  'be',
  'devise',
  '.']]

#### Let's check the stems. Any difference from the lemmas shown before?

[content['lemmas'] for content in result]

In [10]:
# Stem of every token, per annotated document
[doc['stems'] for doc in result]

[['french',
  'author',
  'who',
  'help',
  'pioneer',
  'the',
  'science-fict',
  'genr',
  '.',
  'vern',
  'wrote',
  'about',
  'space',
  ',',
  'air',
  ',',
  'and',
  'underwat',
  'travel',
  'befor',
  'navig',
  'aircraft',
  'and',
  'practic',
  'submarin',
  'were',
  'invent',
  ',',
  'and',
  'befor',
  'ani',
  'mean',
  'of',
  'space',
  'travel',
  'had',
  'been',
  'devis',
  '.']]

#### Now it's the turn of Part of Speech (POS)

In [12]:
# Gather the tokens and the POS tags of every annotated document ...
token = [doc['token'] for doc in result]
pos = [doc['pos'] for doc in result]
# ... then pair each token of the first document with its tag
list(zip(token[0], pos[0]))

[('French', 'JJ'),
 ('author', 'NN'),
 ('who', 'WP'),
 ('helped', 'VBD'),
 ('pioner', 'NN'),
 ('the', 'DT'),
 ('science-fiction', 'JJ'),
 ('genre', 'NN'),
 ('.', '.'),
 ('Verne', 'NNP'),
 ('wrate', 'VBD'),
 ('about', 'IN'),
 ('space', 'NN'),
 (',', ','),
 ('air', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('underwater', 'JJ'),
 ('travel', 'NN'),
 ('before', 'IN'),
 ('navigable', 'JJ'),
 ('aircrast', 'NN'),
 ('and', 'CC'),
 ('practical', 'JJ'),
 ('submarines', 'NNS'),
 ('were', 'VBD'),
 ('invented', 'VBN'),
 (',', ','),
 ('and', 'CC'),
 ('before', 'IN'),
 ('any', 'DT'),
 ('means', 'NNS'),
 ('of', 'IN'),
 ('space', 'NN'),
 ('travel', 'NN'),
 ('had', 'VBD'),
 ('been', 'VBN'),
 ('devised', 'VBN'),
 ('.', '.')]