![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Hardcore DL by Spark-NLP

## Explain Documents with Deep Learning

In [2]:
import sys

#Spark ML and SQL
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import array_contains
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
#Spark NLP
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import *

### Let's create a Spark Session for our app

In [3]:
# Start the Spark session used by the rest of the notebook
# (it is passed to OcrHelper.createDataset further down).
spark = sparknlp.start()

In [4]:
%%time
# Load the pretrained English 'explain_document_dl' pipeline by name.
# The recorded run took ~40 s of wall time with almost no CPU time,
# presumably spent fetching/loading the model — TODO confirm.
pipeline = PretrainedPipeline('explain_document_dl', lang='en')

CPU times: user 7.87 ms, sys: 3.74 ms, total: 11.6 ms
Wall time: 39.6 s


We simply send the text we want to transform and the pipeline does the work.

In [5]:
%%time
# NOTE: the misspellings in this sentence ('beautful', 'wth') look intentional —
# they give the pipeline's 'checked' output something to correct. Do not "fix" them.
text = 'He would love to visit many beautful cities wth you. He lives in an amazing country.'
# For a single string, annotate() returns a mapping from output name
# ('token', 'pos', 'ner', ...) to a list of plain string results.
result = pipeline.annotate(text)

CPU times: user 52.6 ms, sys: 22.3 ms, total: 74.9 ms
Wall time: 3.37 s


We can see the output of each annotator below. This one is doing so many things at once!

In [6]:
# Names of the outputs the pipeline produced, in the order the result mapping yields them.
list(result)

['stem',
 'checked',
 'lemma',
 'document',
 'pos',
 'token',
 'ner_con',
 'ner',
 'embeddings',
 'sentence']

In [7]:
# Sentences the pipeline split the input text into.
result['sentence']

['He would love to visit many beautful cities wth you.',
 'He lives in an amazing country.']

In [8]:
# Lemma assigned to each token, in token order.
result['lemma']

['He',
 'would',
 'love',
 'to',
 'visit',
 'many',
 'beautful',
 'city',
 'wth',
 'you',
 '.',
 'He',
 'life',
 'in',
 'an',
 'amazing',
 'country',
 '.']

In [9]:
# Pair every spell-checked token with its part-of-speech tag.
[(word, tag) for word, tag in zip(result['checked'], result['pos'])]

[('He', 'PRP'),
 ('Fould', 'MD'),
 ('love', 'VB'),
 ('to', 'TO'),
 ('list', 'VB'),
 ('many', 'JJ'),
 ('beautful', 'JJ'),
 ('ties', 'NNS'),
 ('with', 'NN'),
 ('you', 'PRP'),
 ('.', '.'),
 ('He', 'PRP'),
 ('live', 'VBZ'),
 ('in', 'IN'),
 ('an', 'DT'),
 ('main', 'JJ'),
 ('countrymen', 'NN'),
 ('.', '.')]

In [10]:
# NOTE(review): this cell repeats the previous one verbatim (same code, same output);
# it is a candidate for removal or for showing a different pair, e.g. 'token' vs 'ner'.
list(zip(result['checked'], result['pos']))

[('He', 'PRP'),
 ('Fould', 'MD'),
 ('love', 'VB'),
 ('to', 'TO'),
 ('list', 'VB'),
 ('many', 'JJ'),
 ('beautful', 'JJ'),
 ('ties', 'NNS'),
 ('with', 'NN'),
 ('you', 'PRP'),
 ('.', '.'),
 ('He', 'PRP'),
 ('live', 'VBZ'),
 ('in', 'IN'),
 ('an', 'DT'),
 ('main', 'JJ'),
 ('countrymen', 'NN'),
 ('.', '.')]

### Now let's try to use this pipeline to explain a PDF file

In [None]:
%%time
# NOTE(review): this import lives here rather than in the top imports cell;
# presumably sparknlp.ocr is an optional component — confirm before hoisting it.
from sparknlp.ocr import OcrHelper
# Build a Spark DataFrame from the PDF. The path is relative to the notebook's
# working directory, so './immortal_text.pdf' must exist there.
# NOTE(review): this cell is saved as unexecuted (In [None]) although later cells
# that depend on `data` have output — re-run the notebook top to bottom before sharing.
data = OcrHelper().createDataset(spark, './immortal_text.pdf')
data.show()

Next we run the pipeline over the OCR output and look at two of its output columns, `ner` and `checked`.

In [6]:
# Run the same pretrained pipeline over the OCR'd DataFrame and peek at two of its
# output columns; show() truncates each cell, so only the start of each annotation is visible.
pipeline.transform(data).select("ner", "checked").show()

+--------------------+--------------------+
|                 ner|             checked|
+--------------------+--------------------+
|[[named_entity, 0...|[[token, 0, 4, wo...|
+--------------------+--------------------+



In [6]:
# NOTE(review): this cell repeats the previous one verbatim and carries the same
# execution count, which points to out-of-order editing — candidate for removal.
pipeline.transform(data).select("ner", "checked").show()

+--------------------+--------------------+
|                 ner|             checked|
+--------------------+--------------------+
|[[named_entity, 0...|[[token, 0, 4, wo...|
+--------------------+--------------------+



In [13]:
# `local_data` was never assigned anywhere in this notebook (it only existed as
# leftover kernel state), so this cell failed with NameError on a fresh
# Restart & Run All. Derive it here from the OCR DataFrame instead.
# annotate() works on plain Python strings, so bring the first row's text to the driver.
# NOTE(review): assumes the OCR output column is named "text" (the column the
# pipeline reads in pipeline.transform(data)) — confirm against data.show().
local_data = data.select("text").first()["text"]
result = pipeline.annotate(local_data)
# Pair each token of the PDF text with the NER tag predicted for it.
list(zip(result['token'], result['ner']))

[('would', 'B-sent'),
 ('have', 'O'),
 ('been', 'O'),
 ('a', 'O'),
 ('liberation', 'O'),
 (',', 'O'),
 ('a', 'B-sent'),
 ('joy', 'O'),
 (',', 'O'),
 ('and', 'B-sent'),
 ('a', 'O'),
 ('fiesta', 'O'),
 ('.', 'O'),
 ('He', 'B-sent'),
 ('sensed', 'O'),
 ('that', 'O'),
 ('had', 'O'),
 ('he', 'O'),
 ('been', 'O'),
 ('able', 'O'),
 ('to', 'O'),
 ('choose', 'O'),
 ('or', 'O'),
 ('dream', 'O'),
 ('his', 'O'),
 ('death', 'O'),
 ('that', 'O'),
 ('night', 'B-sent'),
 (',', 'O'),
 ('this', 'O'),
 ('is', 'O'),
 ('the', 'B-sent'),
 ('death', 'O'),
 ('he', 'O'),
 ('would', 'O'),
 ('have', 'O'),
 ('dreamed', 'O'),
 ('or', 'O'),
 ('chosen', 'O'),
 ('.', 'O'),
 ('Dahlmann', 'B-sent'),
 ('firmly', 'O'),
 ('grips', 'O'),
 ('the', 'O'),
 ('knife', 'O'),
 (',', 'O'),
 ('which', 'O'),
 ('he', 'B-sent'),
 ('may', 'O'),
 ('have', 'B-sent'),
 ('no', 'O'),
 ('idea', 'O'),
 ('how', 'O'),
 ('to', 'O'),
 ('manage', 'O'),
 (',', 'O'),
 ('and', 'B-sent'),
 ('steps', 'O'),
 ('out', 'O'),
 ('into', 'O'),
 ('the', 'O'),
