# Example: extracting tables from PDFs with selectable text using the Tabula framework

## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
import os

# Credentials are read from the environment so that neither the secret nor the
# license key is stored in (and committed with) the notebook. Export
# SPARK_OCR_SECRET and/or JSL_OCR_LICENSE before starting Jupyter; both fall
# back to an empty string when unset.
secret = os.environ.get("SPARK_OCR_SECRET", "")
license = os.environ.get("JSL_OCR_LICENSE", "")
# The package version is the part of the secret before the first "-".
version = secret.split("-")[0]
# Directory searched for the locally built spark-ocr assembly jar.
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
# Probe for the google.colab module; outside Colab this cell does nothing.
if python -c 'import google.colab' > /dev/null 2>&1
then
    printf '%s\n' "Run on Google Colab!" "Install Open JDK"
    # Standard output of apt-get is discarded to keep the cell output short.
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    java -version
fi

In [3]:
import os
import sys

# When running on Colab, point JAVA_HOME at the OpenJDK 8 directory and put
# its bin/ folder first on PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

In [4]:
# Install from PyPI using the secret (uncomment the line below to use this option)
#%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [5]:
# Or install from a local build of the package (relative path to the source distribution)
%pip install --user ../../python/dist/spark-ocr-1.5.0.tar.gz

Processing e:\ideaproject\spark-ocr-master\python\dist\spark-ocr-1.5.0.tar.gz
Building wheels for collected packages: spark-ocr
  Building wheel for spark-ocr (setup.py): started
  Building wheel for spark-ocr (setup.py): finished with status 'done'
  Created wheel for spark-ocr: filename=spark_ocr-1.5.0-py3-none-any.whl size=7253315 sha256=91d1f07430dac8f811abe941ced10f5b40b50e9a5c1529a917a7b53d42a4442d
  Stored in directory: c:\users\pc\appdata\local\pip\cache\wheels\e5\4d\42\9b9d11a5c5ce2bcae6da03ec982593f1c2baef04fb159121c5
Successfully built spark-ocr
Installing collected packages: spark-ocr
  Attempting uninstall: spark-ocr
    Found existing installation: spark-ocr 1.5.0
    Uninstalling spark-ocr-1.5.0:
      Successfully uninstalled spark-ocr-1.5.0
Successfully installed spark-ocr-1.5.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\pc\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


## Initialize the Spark session

In [None]:
from sparkocr import start

# Pass the license to Spark OCR through the environment, but only when one
# was actually provided.
if license:
    os.environ.update({'JSL_OCR_LICENSE': license})

# Start the session with the secret and jar location from the config cell.
spark = start(
    secret=secret,
    jar_path=spark_ocr_jar_path,
)
# Last expression of the cell: renders the session object as the cell output.
spark

## Import OCR transformers

In [None]:
from sparkocr.transformers import *
from pyspark.sql.functions import collect_list,col

## Define OCR transformers and pipeline

In [None]:
def pipelineTransformer(dataframe, page_index=1, line_width=1):
    """Detect tables on one PDF page and draw their regions onto the PDF.

    Args:
        dataframe: DataFrame with the PDF bytes in a ``content`` column and a
            ``path`` column; both are used as grouping keys below.
        page_index: Value passed to ``PdfToTextTable.setPageIndex``. Defaults
            to 1, the value that used to be hard-coded. Whether the index is
            zero- or one-based is not visible here -- TODO confirm.
        line_width: Value passed to ``PdfDrawRegions.setLineWidth``. Defaults
            to 1, the value that used to be hard-coded.

    Returns:
        The DataFrame returned by ``PdfDrawRegions.transform``; the drawer is
        configured to write its output to the ``pdf_with_regions`` column.
    """
    # Extract tables from the requested page of each PDF document.
    pdf_to_text_table = PdfToTextTable()
    pdf_to_text_table.setInputCol("content")
    pdf_to_text_table.setOutputCol("table")
    pdf_to_text_table.setPageIndex(page_index)

    tables = pdf_to_text_table.transform(dataframe)

    # Promote the nested table area to its own column, then gather every area
    # belonging to the same document into a single "coordinates" list.
    with_area = tables.withColumn("area", col("table.area"))
    aggregated = with_area.groupBy("path", "content").agg(
        collect_list("area").alias("coordinates")
    )

    # Draw the collected regions onto the PDF.
    draw = PdfDrawRegions()
    draw.setInputRegionsCol("coordinates")
    draw.setOutputCol("pdf_with_regions")
    draw.setLineWidth(line_width)

    return draw.transform(aggregated)

## Read PDF document as binary file

In [None]:
import pkg_resources

# Resolve the file-system path of the sample PDF inside the sparkocr package.
pdf_example = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/tabular-pdf/frx_2012_disclosure.pdf',
)

# Load the file with Spark's binaryFile source and cache the DataFrame.
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf_example)
    .cache()
)

## Run OCR pipelines

In [None]:
# Apply the table-detection/drawing pipeline to the sample PDF and cache the
# resulting DataFrame.
result = pipelineTransformer(pdf_example_df).cache()

## Display results

In [None]:
# Bring the first result row to the driver; its `pdf_with_regions` value is
# written out below in binary mode.
pdf = result.select("pdf_with_regions").collect()[0]

output_path = "././data/pdf/tabular-output/"
# exist_ok=True replaces the separate exists() check, so the cell can be
# re-run without failing when the directory is already there.
os.makedirs(output_path, exist_ok=True)

with open(os.path.join(output_path, "test.pdf"), "wb") as pdf_file:
    pdf_file.write(pdf.pdf_with_regions)



In [None]:
%%bash
# Clean up: recursively remove the directory tree that holds the generated output PDF.
rm -r -f ././data/pdf/

In [None]:
## Clear cache

In [None]:
## Clear cache

## Clear cache

In [None]:
# Release both cached DataFrames: the input read from the sample PDF and the
# pipeline result. `result.unpersist()` stays last so the cell output is unchanged.
pdf_example_df.unpersist()
result.unpersist()