In [5]:
# some basic imports
import sparknlp
# Call Spark NLP's start helper. Its return value is not bound to a name here;
# the `spark` variable used by later cells is created in the next cell.
# NOTE(review): presumably this already creates a Spark session. If so, the
# builder in the next cell may attach to that session via getOrCreate() rather
# than build a new one, and its jar/memory settings may not apply on a fresh
# kernel -- confirm with Restart & Run All.
sparknlp.start()

In [6]:
# Build the Spark session (`spark`) that the OCR and NLP cells below rely on.
# SparkSession is imported here explicitly: nothing earlier in the notebook
# brings the name into scope, so on a fresh kernel this cell would otherwise
# fail with NameError.
from pyspark.sql import SparkSession

# NOTE(review): getOrCreate() returns an already-running session when one
# exists. Because `sparknlp.start()` is called in the previous cell, confirm
# that the package and memory settings below actually take effect.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Spark NLP with OCR')
    .config("spark.driver.memory", "12g")
    .config("spark.executor.memory", "12g")
    # Resolve the extra jars over HTTPS instead of plain HTTP so the downloaded
    # artifacts cannot be tampered with in transit.
    .config("spark.jars.repositories", "https://repo.spring.io/plugins-release")
    # Spark NLP core, the Spark NLP OCR module, and the JAI imaging dependency.
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.2.1,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.2.1,javax.media.jai:com.springsource.javax.media.jai.core:1.1.3")
    .getOrCreate()
)

In [7]:
# OCR step: turn the PDF into a Spark dataset with one row per extracted text region.
from sparknlp.ocr import OcrHelper

ocrHelper = OcrHelper()
# Ask the helper to prefer the "image" extraction method
# (presumably rasterise-then-OCR rather than reading embedded text -- confirm).
ocrHelper.setPreferredMethod("image")

# The path may also be given as file:/// for local files, or hdfs:/// when the
# files are hosted in Hadoop.
dataset = ocrHelper.createDataset(
    spark,
    './immortal_image.pdf',
)
# Materialise every row on the driver so the extracted text is displayed.
dataset.collect()

[Row(text='would have been a liberation, a joy, and a fiesta.\nHe sensed that had he been able to choose or\ndream his death that night, this is the death he\nwould have dreamed or chosen.\n\nDahlmann firmly grips the knife, which he\nmay have no idea how to manage, and steps out\ninto the plains.\n', pagenum=0, method='image', noiselevel=0.0, confidence=0.0, positions=None, filename='file:/home/jose/Documents/presentation-oreilly-ai/notebook/immortal_image.pdf'),
 Row(text='The Aleph\n(1949)\n', pagenum=0, method='image', noiselevel=0.0, confidence=0.0, positions=None, filename='file:/home/jose/Documents/presentation-oreilly-ai/notebook/immortal_image.pdf'),
 Row(text='The Immortal\n', pagenum=0, method='image', noiselevel=0.0, confidence=0.0, positions=None, filename='file:/home/jose/Documents/presentation-oreilly-ai/notebook/immortal_image.pdf'),
 Row(text='Solomon saith: There is no new thing upon\nthe earth. So that as Plato had an imagination,\nthat all knowledge was but remembra

In [39]:
import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
# Import Pipeline explicitly instead of relying on it leaking through one of
# the star imports above.
from pyspark.ml import Pipeline

# Create the NER pipeline, (almost) from scratch: the stages are wired together
# by column name, each stage reading the columns produced by earlier stages.

# Wrap the raw OCR "text" column as a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained models are loaded through `pretrained()` called on the class
# itself, consistently for all three models, so no throwaway instance is
# constructed just to reach the loader.
embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Spell checker over the tokens. Its "spell" output is not listed as an input
# of any later stage in this cell, so NER runs on the uncorrected tokens.
spell = (
    NorvigSweetingModel.pretrained()
    .setInputCols(["token"])
    .setOutputCol("spell")
)

# Token-level NER tags computed from the tokens and their embeddings.
ner_dl = (
    NerDLModel.pretrained()
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner_dl")
)

# Turn the token-level tags into entity chunks ("ner_chunk").
ner_converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner_dl"])
    .setOutputCol("ner_chunk")
)

# NOTE: `finisher` is configured here but deliberately NOT added to the stages
# below; the next cell reads the raw annotation rows from "ner_chunk".
finisher = (
    Finisher()
    .setInputCols(["document", "token", "ner_dl", "ner_chunk"])
    .setIncludeMetadata(True)
)

pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    embeddings,
    spell,
    ner_dl,
    ner_converter,
])

glove_100d download started this may take some time.
Approximate size to download 144.3 MB
[OK!]
spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]
ner_dl_contrib download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [61]:
# Run the pipeline over the OCR dataset and print the entity chunks found in
# one of the extracted text rows.
ROW_INDEX = 3    # which collected row to inspect (the original note suggests trying 3 and 4)
MAX_CHUNKS = 10  # upper bound on how many chunks are printed

annotated = pipeline.fit(dataset).transform(dataset)
selected_row = annotated.select('ner_chunk').collect()[ROW_INDEX]

# Apply the limit to the list of chunk annotations. Previously `[:10]` was
# applied to the single-field Row itself, which only wrapped the full chunk
# list in a one-element tuple and limited nothing.
rows = selected_row['ner_chunk'][:MAX_CHUNKS]
for r in rows:
    print(str(r), '\n')

Row(annotatorType='chunk', begin=0, end=6, result='Solomon', metadata={'sentence': '0', 'chunk': '0', 'entity': 'PER'}, embeddings=[], sentence_embeddings=[]) 

Row(annotatorType='chunk', begin=64, end=68, result='Plato', metadata={'sentence': '0', 'chunk': '1', 'entity': 'PER'}, embeddings=[], sentence_embeddings=[]) 

Row(annotatorType='chunk', begin=133, end=139, result='Solomon', metadata={'sentence': '0', 'chunk': '2', 'entity': 'PER'}, embeddings=[], sentence_embeddings=[]) 

Row(annotatorType='chunk', begin=197, end=209, result='Francis Bacon', metadata={'sentence': '0', 'chunk': '3', 'entity': 'PER'}, embeddings=[], sentence_embeddings=[]) 

Row(annotatorType='chunk', begin=220, end=224, result='LVIII', metadata={'sentence': '0', 'chunk': '4', 'entity': 'ORG'}, embeddings=[], sentence_embeddings=[]) 



In [68]:
# Alternative to hand-building stages: download a complete pretrained pipeline
# and annotate a plain string with it.
from sparknlp.pretrained import PretrainedPipeline

# NOTE(review): this rebinds `pipeline`, which an earlier cell uses for the
# hand-built Pipeline; re-running that cell after this one will see this object.
pipeline = PretrainedPipeline(
    'explain_document_dl_fast',
    'en',
)
# The input sentence contains two misspelled words on purpose.
annotations = pipeline.annotate(
    "Harry Potter is a graet muvie"
)

explain_document_dl_fast download started this may take some time.
Approx size to download 167.4 MB
[OK!]


In [67]:
# Pair each entry of the 'checked' output with the entry of the 'ner' output
# at the same position and display the pairs.
checked = annotations['checked']
ner = annotations['ner']
[(word, tag) for word, tag in zip(checked, ner)]

[('Harry', 'I-PER'),
 ('Potter', 'I-PER'),
 ('is', 'O'),
 ('a', 'O'),
 ('great', 'O'),
 ('movie', 'O')]