In [1]:
import os
import sys

# Location of the spark-nlp Python sources this notebook imports from.
# Set the SPARK_NLP_PYTHON_PATH environment variable to point at your own
# checkout; the fallback is the path the notebook was originally written with.
SPARK_NLP_PYTHON_PATH = os.environ.get(
    'SPARK_NLP_PYTHON_PATH', '/home/saif/repos/spark-nlp/python'
)

# Guard the append so re-running this cell does not pile up duplicate entries.
if SPARK_NLP_PYTHON_PATH not in sys.path:
    sys.path.append(SPARK_NLP_PYTHON_PATH)

from sparknlp.ocr import OcrHelper, Coordinate
from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp

In [2]:
# Obtain the `spark` handle from sparknlp; it is passed to the OcrHelper
# calls (createDataset / drawRectangle) in the cells below.
spark = sparknlp.start()

In [3]:
# Helper object used below both to read the PDF into a DataFrame and to draw
# rectangles for the extracted coordinates.
ocr = OcrHelper()

In [4]:
# Read the sample PDF (path is relative to the notebook's working directory)
# into a DataFrame; its columns are shown by the next cell.
data = ocr.createDataset(spark, './simple_document_paragraph.pdf')

In [5]:
# Overview of the OCR result: one row for page 0 of this document, with the
# extracted `text` and the per-character `positions` (values are truncated in
# this default rendering).
data.show()

+--------------------+-------+------+----------+----------+--------------------+--------------------+
|                text|pagenum|method|noiselevel|confidence|           positions|            filename|
+--------------------+-------+------+----------+----------+--------------------+--------------------+
|Hello world, this...|      0|  text|       0.0|       0.0|[[[[H, 72.024, 70...|file:/home/saif/r...|
+--------------------+-------+------+----------+----------+--------------------+--------------------+



In [6]:
# Inspect the type of the `positions` column: an array of structs, each with a
# `mapping` array of per-character entries (c, x, y, width, height) plus
# lowerLeftX / lowerLeftY.
data.select('positions').schema[0]

StructField(positions,ArrayType(StructType(List(StructField(mapping,ArrayType(StructType(List(StructField(c,StringType,true),StructField(x,FloatType,false),StructField(y,FloatType,false),StructField(width,FloatType,false),StructField(height,FloatType,false))),true),true),StructField(lowerLeftX,FloatType,false),StructField(lowerLeftY,FloatType,false))),true),true)

In [7]:
# Show the extracted text without column truncation.
data.select('text').show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                               |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Hello world, this is a paragraph full of cool text and the such. This paragraph is multiline so I hope you 
enjoy your ride here, which I think this is all through t

In [8]:
# Pull the text of the first row as a plain Python string so that embedded
# newline characters are visible in the repr.
data.select('text').take(1)[0]['text']

'Hello world, this is a paragraph full of cool text and the such. This paragraph is multiline so I hope you \nenjoy your ride here, which I think this is all through the 2nd line already, what is the meaning of this? \n \nI have left some blanks so I continue here. \n \nGoodbye… \n'

In [9]:
# Peek at six consecutive per-character entries from the first `positions`
# element of the first row. The slice 210:216 was picked by hand: in this
# document it covers the characters around the end of one text line and the
# start of the next ones (note the `y` value changing in the output).
(
    data
    .select('positions')
    .take(1)[0]['positions'][0]['mapping'][210:216]
)

[Row(c='i', x=511.0361633300781, y=695.02001953125, width=2.539215087890625, height=5.519999980926514),
 Row(c='s', x=513.542236328125, y=695.02001953125, width=4.316650390625, height=5.519999980926514),
 Row(c='?', x=517.85888671875, y=695.02001953125, width=5.11151123046875, height=5.519999980926514),
 Row(c=' ', x=522.9400024414062, y=695.02001953125, width=2.49505615234375, height=5.519999980926514),
 Row(c=' ', x=72.02400207519531, y=672.5800170898438, width=2.4950408935546875, height=5.519999980926514),
 Row(c='I', x=72.02400207519531, y=650.02001953125, width=2.7820816040039062, height=5.519999980926514)]

In [10]:
# Raw `text` column -> `document` annotation, using the "shrink_each"
# cleanup mode.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink_each")
)

# `document` -> `sentence`. Only inspected later in the notebook; the stages
# below read `document`, not `sentence`.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# `document` -> `token`.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Match the phrases listed in simple_document_entities.txt against the
# tokens of the document; matches land in `entities`.
extractor = (
    TextMatcher()
    .setEntities("simple_document_entities.txt")
    .setInputCols(["token", "document"])
    .setOutputCol("entities")
)

# `entities` + the `positions` page matrix column -> `coordinates`.
finder = (
    PositionFinder()
    .setInputCols(["entities"])
    .setOutputCol("coordinates")
    .setPageMatrixCol("positions")
)

# Stage order follows the column dependencies declared above.
pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        extractor,
        finder,
    ]
)


In [11]:
# Fit the pipeline on the OCR DataFrame to obtain the model applied in the
# next cell.
model = pipeline.fit(data)

In [12]:
# Apply the fitted pipeline; `extracted` carries the annotation columns
# (document, sentence, token, entities, coordinates) inspected below.
extracted = model.transform(data)

In [13]:
# Show the document annotation's text together with its begin/end offsets,
# without column truncation.
(
    extracted
    .select('document.result', 'document.begin', 'document.end')
    .show(truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+-----+
|result                                                                                                                                                                                                                                                                         |begin|end  |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+-----+
|[Hello world, this is a paragraph full of cool text and the such. This paragraph is multiline so I hope you enjoy your ride here, which I thi

In [14]:
# Document text of the first row as a Python list. Compared with the raw
# `text` column, the newline characters no longer appear in this result.
extracted.select('document.result').take(1)[0]['result']

['Hello world, this is a paragraph full of cool text and the such. This paragraph is multiline so I hope you enjoy your ride here, which I think this is all through the 2nd line already, what is the meaning of this?  I have left some blanks so I continue here.  Goodbye… ']

In [15]:
# Sentences detected in the document of the first row.
extracted.select('sentence.result').take(1)[0]['result']

['Hello world, this is a paragraph full of cool text and the such.',
 'This paragraph is multiline so I hope you enjoy your ride here, which I think this is all through the 2nd line already, what is the meaning of this?',
 'I have left some blanks so I continue here.',
 'Goodbye…']

In [16]:
# Chunks produced by the TextMatcher stage, each with begin/end character
# offsets and the matched text (presumably the entries of
# simple_document_entities.txt - the file is not shown here).
extracted.select('entities').show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|entities                                                                                                                                                                                               |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[chunk, 41, 49, cool text, [sentence -> 0, chunk -> 0], [], []], [chunk, 83, 91, multiline, [sentence -> 0, chunk -> 1], [], []], [chunk, 227, 237, some blanks, [sentence -> 0, chunk -> 2], [], []]]|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
# Token annotations of the first row, including each token's begin/end
# offsets.
extracted.select('token').take(1)[0]['token']

[Row(annotatorType='token', begin=0, end=4, result='Hello', metadata={'sentence': '0'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='token', begin=6, end=10, result='world', metadata={'sentence': '0'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='token', begin=11, end=11, result=',', metadata={'sentence': '0'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='token', begin=13, end=16, result='this', metadata={'sentence': '0'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='token', begin=18, end=19, result='is', metadata={'sentence': '0'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='token', begin=21, end=21, result='a', metadata={'sentence': '0'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='token', begin=23, end=31, result='paragraph', metadata={'sentence': '0'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='token', begin=33, end=36, result='full', metadata={'sentence': '0'}, embeddings=[

In [18]:
# Output of the PositionFinder stage: a list of (x, y, w, h) entries. Here
# there are three, matching the three entities found above.
extracted.select('coordinates').show(truncate=False)

+----------------------------------------------------------------------------------------------------------+
|coordinates                                                                                               |
+----------------------------------------------------------------------------------------------------------+
|[[241.40068, 709.54, 38.87184, 5.52], [423.43, 709.54, 39.57843, 5.52], [118.94, 650.02, 55.409744, 5.52]]|
+----------------------------------------------------------------------------------------------------------+



In [19]:
# Only the first row's coordinates are used, so fetch just that row with
# take(1) (as the other cells in this notebook do) instead of collecting the
# whole DataFrame to the driver and discarding everything after element 0.
coordinate_rows = extracted.select('coordinates').take(1)[0]['coordinates']
coordinate_rows

[Row(x=241.4006805419922, y=709.5399780273438, w=38.87184143066406, h=5.519999980926514),
 Row(x=423.42999267578125, y=709.5399780273438, w=39.57843017578125, h=5.519999980926514),
 Row(x=118.94000244140625, y=650.02001953125, w=55.40974426269531, h=5.519999980926514)]

In [20]:
# Wrap every coordinate Row (fields x, y, w, h) in a Coordinate object, which
# is the form handed to ocr.drawRectangle at the end of the notebook.
coordinates = [
    Coordinate(row['x'], row['y'], row['w'], row['h'])
    for row in coordinate_rows
]
coordinates

[<sparknlp.ocr.Coordinate at 0x7ffe7d160b38>,
 <sparknlp.ocr.Coordinate at 0x7ffe7d160198>,
 <sparknlp.ocr.Coordinate at 0x7ffe7d1608d0>]

In [21]:
# Spot-check: the first Coordinate exposes the `y` value it was built from
# (it matches `y` of the first coordinate row shown earlier).
coordinates[0].y

709.5399780273438

In [23]:
# Hand the entity coordinates back to the OCR helper together with the same
# source PDF that was read at the top of the notebook.
# NOTE(review): presumably this renders the rectangles onto the document;
# where the result is written is not visible from this notebook - confirm.
ocr.drawRectangle(spark, './simple_document_paragraph.pdf', coordinates)