### This notebook shows an example of extracting coordinates from PDF files
### Immediately after, we use a PositionFinder-based pipeline to match chunk entities back to their positions in the original PDF

In [1]:
import sys

# NOTE(review): hard-coded, machine-specific path to a local spark-nlp checkout.
# Adjust it (or drop it if sparknlp is installed as a package) before running
# this notebook on another machine.
sys.path.append('/home/saif/repos/spark-nlp/python')

from sparknlp.ocr import OcrHelper, Coordinate
# Wildcard imports: the pipeline cell uses DocumentAssembler, SentenceDetector,
# Tokenizer, TextMatcher and PositionFinder without importing them by name, so
# presumably they are provided by these two modules.
from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp

In [2]:
# Start the session object that is passed to the OCR helper calls below.
spark = sparknlp.start()

In [3]:
# Helper used below to build the OCR dataset and to draw rectangles into PDFs.
ocr = OcrHelper()

In [4]:
# Turn on page splitting. Presumably this is what yields one dataset row per
# page (see the `pagenum` column in the preview further down) - TODO confirm.
ocr.setSplitPages(True)

In [5]:
# Build the OCR dataset from the files found under ./input_coordinates/.
data = ocr.createDataset(spark, './input_coordinates/')

Here we get the OCR output, with various useful attributes

In [6]:
# Preview the first two rows of the OCR dataset.
data.show(2)

+--------------------+-------+------+----------+----------+--------------------+--------------------+
|                text|pagenum|method|noiselevel|confidence|           positions|            filename|
+--------------------+-------+------+----------+----------+--------------------+--------------------+
|Alexandria is the...|      0|  text|       0.0|       0.0|[[[[A, 1, 72.024,...|file:/home/saif/r...|
|Alexandria was fo...|      1|  text|       0.0|       0.0|[[[[A, 2, 72.024,...|file:/home/saif/r...|
+--------------------+-------+------+----------+----------+--------------------+--------------------+
only showing top 2 rows



In [7]:
# List every distinct source file in the dataset, without truncating the paths.
(
    data
    .select('filename')
    .distinct()
    .show(truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------+
|filename                                                                                                           |
+-------------------------------------------------------------------------------------------------------------------+
|file:/home/saif/repos/spark-nlp-workshop/jupyter/annotation/english/ocr/input_coordinates/thomas_edison_sample.pdf |
|file:/home/saif/repos/spark-nlp-workshop/jupyter/annotation/english/ocr/input_coordinates/alexandria_multi_page.pdf|
+-------------------------------------------------------------------------------------------------------------------+



In [8]:
# Display the OCR text of the first row returned by the dataset.
(
    data
    .select('text')
    .take(1)[0]['text']
)

'Alexandria is the second-largest city in Egypt and a major economic centre, \nextending about 32 km (20 mi) along the coast of the Mediterranean Sea in \nthe north central part of the country. \n \n \n \n \nIts low elevation on the Nile delta makes it highly vulnerable to rising sea \nlevels. Alexandria is an important industrial center because of its natural \ngas and oil pipelines from Suez. Alexandria is also a popular tourist \ndestination. \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n'

Here we show the inner structure of the OCR position matrix

In [9]:
# `positions` column of the first row; show the first 20 entries of the
# `mapping` field of its first element.
first_row_positions = data.select('positions').take(1)[0]['positions']
first_row_positions[0]['mapping'][:20]

[Row(c='A', p=1, x=72.02400207519531, y=706.9000244140625, width=9.364677429199219, height=6.5847601890563965),
 Row(c='l', p=1, x=81.38867950439453, y=706.9000244140625, width=3.11688232421875, height=6.5847601890563965),
 Row(c='e', p=1, x=84.50556182861328, y=706.9000244140625, width=7.806236267089844, height=6.5847601890563965),
 Row(c='x', p=1, x=92.31179809570312, y=706.9000244140625, width=7.019996643066406, height=6.5847601890563965),
 Row(c='a', p=1, x=99.14927673339844, y=706.9000244140625, width=7.806243896484375, height=6.5847601890563965),
 Row(c='n', p=1, x=106.95552062988281, y=706.9000244140625, width=7.806243896484375, height=6.5847601890563965),
 Row(c='d', p=1, x=114.76176452636719, y=706.9000244140625, width=7.806243896484375, height=6.5847601890563965),
 Row(c='r', p=1, x=122.56800842285156, y=706.9000244140625, width=4.675323486328125, height=6.5847601890563965),
 Row(c='i', p=1, x=127.24333190917969, y=706.9000244140625, width=3.11688232421875, height=6.584760189

__Here we proceed to create a Pipeline with a TextMatcher and a PositionFinder, consuming the positions column__

In [10]:
# Reads the OCR `text` column and writes the `document` column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink_each")
)

# Writes the `sentence` column. Note that the tokenizer and the matcher below
# read `document`, not `sentence`.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Matches the phrases listed in coordinate_entities.txt and writes `entities`.
extractor = (
    TextMatcher()
    .setEntities("coordinate_entities.txt")
    .setInputCols(["token", "document"])
    .setOutputCol("entities")
)

# Reads `entities` together with the OCR `positions` column and writes
# `coordinates`.
finder = (
    PositionFinder()
    .setInputCols(["entities"])
    .setOutputCol("coordinates")
    .setPageMatrixCol("positions")
)

# Stage order matters: each stage consumes columns produced by earlier ones.
pipeline_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    extractor,
    finder,
]
pipeline = Pipeline(stages=pipeline_stages)


In [11]:
# Fit the pipeline on the OCR dataset to obtain the model used for transform.
model = pipeline.fit(data)

In [13]:
# Run the fitted pipeline over the OCR dataset; the cells below read the
# document, sentence, entities and coordinates columns from the result.
extracted = model.transform(data)

In [14]:
# Display the assembled document text of the first row.
(
    extracted
    .select('document.result')
    .take(1)[0]['result']
)

['Alexandria is the second-largest city in Egypt and a major economic centre, extending about 32 km (20 mi) along the coast of the Mediterranean Sea in the north central part of the country.     Its low elevation on the Nile delta makes it highly vulnerable to rising sea levels. Alexandria is an important industrial center because of its natural gas and oil pipelines from Suez. Alexandria is also a popular tourist destination.                 ']

In [15]:
# Display the third detected sentence (index 2) of the first row.
first_row_sentences = extracted.select('sentence.result').take(1)[0]['result']
first_row_sentences[2]

'Alexandria is an important industrial center because of its natural gas and oil pipelines from Suez.'

__Here we show the matching entities of the TextMatcher__

In [28]:
# Display the matched entities of the first row. Only one row is shown, so
# fetch just that row with take(1) instead of collecting the whole DataFrame
# to the driver (this also matches the other preview cells).
extracted.select('entities').take(1)[0]['entities']

[Row(annotatorType='chunk', begin=41, end=45, result='Egypt', metadata={'sentence': '0', 'chunk': '0'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='chunk', begin=129, end=145, result='Mediterranean Sea', metadata={'sentence': '0', 'chunk': '1'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='chunk', begin=160, end=171, result='central part', metadata={'sentence': '0', 'chunk': '2'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='chunk', begin=238, end=254, result='highly vulnerable', metadata={'sentence': '0', 'chunk': '3'}, embeddings=[], sentence_embeddings=[]),
 Row(annotatorType='chunk', begin=305, end=321, result='industrial center', metadata={'sentence': '0', 'chunk': '4'}, embeddings=[], sentence_embeddings=[])]

__Here a sample of the Position Finder coordinates__

In [43]:
# Take the x and y coordinate arrays of the first row and print them,
# separated by a blank line.
extraction = extracted.select('coordinates.x', 'coordinates.y').take(1)[0]
x_line = 'x: ' + str(extraction['x'])
y_line = 'y: ' + str(extraction['y'])
print(x_line, '', y_line, sep='\n')

x: [314.76116943359375, 314.76116943359375, 402.94482421875, 314.76116943359375, 402.94482421875, 131.14645385742188, 314.76116943359375, 402.94482421875, 131.14645385742188, 336.46734619140625, 314.76116943359375, 402.94482421875, 131.14645385742188, 336.46734619140625, 281.5292663574219]

y: [706.9000244140625, 706.9000244140625, 689.5, 706.9000244140625, 689.5, 672.0999755859375, 706.9000244140625, 689.5, 672.0999755859375, 545.22998046875, 706.9000244140625, 689.5, 672.0999755859375, 545.22998046875, 527.8300170898438]


### With the OCR dataset, we draw the matched rectangles for each file into new output PDF files

In [44]:
# Draw the rectangles for every row of `extracted`; the output location and
# file-name suffix are passed as keyword arguments.
ocr.drawRectanglesDataset(
    spark,
    extracted,
    output_location='./output_coordinates',
    output_suffix='_squared',
)
print("Output files sent to output_coordinates folder, with red highlighted squares")

Output files sent to output_coordinates folder, with red highlighted squares


### Now an example of picking a single file, its appropriate rows and drawing to file directly

In [21]:
# One list of coordinate rows per dataset row; display the first one.
coordinate_rows = [
    row['coordinates']
    for row in extracted.select('coordinates').collect()
]
coordinate_rows[0]

[Row(i=0, p=1, x=314.76116943359375, y=706.9000244140625, w=35.703704833984375, h=6.5847601890563965),
 Row(i=0, p=1, x=314.76116943359375, y=706.9000244140625, w=35.703704833984375, h=6.5847601890563965),
 Row(i=1, p=1, x=402.94482421875, y=689.5, w=119.04510498046875, h=6.5847601890563965),
 Row(i=0, p=1, x=314.76116943359375, y=706.9000244140625, w=35.703704833984375, h=6.5847601890563965),
 Row(i=1, p=1, x=402.94482421875, y=689.5, w=119.04510498046875, h=6.5847601890563965),
 Row(i=2, p=1, x=131.14645385742188, y=672.0999755859375, w=70.00346374511719, h=6.5847601890563965),
 Row(i=0, p=1, x=314.76116943359375, y=706.9000244140625, w=35.703704833984375, h=6.5847601890563965),
 Row(i=1, p=1, x=402.94482421875, y=689.5, w=119.04510498046875, h=6.5847601890563965),
 Row(i=2, p=1, x=131.14645385742188, y=672.0999755859375, w=70.00346374511719, h=6.5847601890563965),
 Row(i=3, p=1, x=336.46734619140625, y=545.22998046875, w=105.00515747070312, h=6.5847601890563965),
 Row(i=0, p=1, x=31

In [24]:
# Pick the coordinates found on the first page of the Alexandria PDF and draw
# them onto a copy of that file.
# The row is selected explicitly by file name and page number: the dataset
# holds pages from several PDFs and row order is not guaranteed, so taking
# "whatever row comes first" could draw another file's rectangles here.
alexandria_first_page = (
    extracted
    .filter(extracted['filename'].endswith('alexandria_multi_page.pdf'))
    .filter(extracted['pagenum'] == 0)  # pagenum is 0-based in the preview above
    .select('coordinates')
    .take(1)[0]
)
alexandria_coords = list(alexandria_first_page['coordinates'])
ocr.drawRectanglesToFile('./input_coordinates/alexandria_multi_page.pdf', alexandria_coords, './picked_coordinates.pdf')