# Example of extracting table data from Microsoft PPT documents

## Install spark-ocr python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# --- User-supplied settings: fill these in before running the notebook ---

# Local directory that is passed to `start()` as `jar_path`.
spark_ocr_jar_path = "../../target/scala-2.11"

# Secret and license strings (left empty here).
secret = ""
license = ""

# Optional AWS credentials; they are exported to the environment only when set.
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""

# The version is the part of the secret that precedes the first "-".
version = secret.partition("-")[0]

In [None]:
%%bash
# Only Google Colab needs the JDK installed here: stop successfully when the
# `google.colab` module cannot be imported.
python -c 'import google.colab' > /dev/null 2>&1 || exit 0

echo "Run on Google Colab!"
echo "Install Open JDK"
# Quiet install of the headless OpenJDK 8 package; stdout is discarded.
apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Print the Java version that is now on the PATH.
java -version

In [None]:
import os
import sys

# Export the AWS credentials only when an access key id was provided.
if AWS_ACCESS_KEY_ID != "":
    os.environ.update(
        AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID,
        AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY,
    )

# Expose the license through the JSL_OCR_LICENSE environment variable.
if license:
    os.environ["JSL_OCR_LICENSE"] = license

# On Google Colab, point JAVA_HOME at OpenJDK 8 and put its `bin` first on PATH.
if "google.colab" in sys.modules:
    java_home = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["JAVA_HOME"] = java_home
    os.environ["PATH"] = f"{java_home}/bin:{os.environ['PATH']}"

In [4]:
# Install from PyPI using the secret (uncomment the line below to use it)
#%pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
# Or install the package from a local source archive instead of PyPI.
# NOTE(review): the archive name hard-codes version 3.7.0 (presumably the Spark 3.0
# build, given the "spark30" suffix) — adjust the path to match your local build.
%pip install ../../python/dist/spark-ocr-3.7.0+spark30.tar.gz

## Initialization of spark session

In [14]:
from sparkocr import start


# Start the Spark session through `sparkocr.start`, passing the secret, the
# local jar path and the requested Spark NLP version.
spark = start(secret=secret, jar_path=spark_ocr_jar_path, nlp_version="3.1.3")

# Only report errors from Spark, to keep the cell outputs readable.
spark.sparkContext.setLogLevel("ERROR")

# Keep the session as the LAST expression of the cell: a bare expression in the
# middle of a cell is evaluated and discarded, so the session would not be shown.
spark

Spark version: 3.0.2
Spark NLP version: 2.5.5
Spark OCR version: 3.7.0



## Import OCR transformers

In [15]:
#from sparkocr.transformers import *
from sparkocr.transformers import PptToTextTable

from sparkocr.utils import display_image
from pyspark.sql.functions import collect_list,col

## Read PPT document as binary file

In [16]:
import pkg_resources
# Resolve the path of the sample presentation shipped inside the `sparkocr` package.
ppt_example = pkg_resources.resource_filename(
    "sparkocr",
    "resources/ocr/ppt/niagara_overview.ppt",
)

# Load the file with Spark's "binaryFile" source and cache the resulting DataFrame.
binary_file_reader = spark.read.format("binaryFile")
ppt_example_df = binary_file_reader.load(ppt_example).cache()

## Preview document using `PptToPdf` and `PdfToImage` transformers

In [None]:
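# Optional preview (uncomment to run). NOTE: `PptToPdf` and `PdfToImage` are not imported
# above — import them first, e.g. `from sparkocr.transformers import PptToPdf, PdfToImage`.
#image_df = PdfToImage().transform(PptToPdf().setOutputCol("content").transform(ppt_example_df))
#for r in image_df.select("image").collect():
#    display_image(r.image)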

## Extract tables using `PptToTextTable` transformer

In [19]:
# Configure the table extractor: its input column is `content` and its
# output column is `tables`.
ppt_to_table = PptToTextTable()
ppt_to_table.setInputCol("content")
ppt_to_table.setOutputCol("tables")

# Apply the transformer to the DataFrame loaded from the sample PPT file.
result = ppt_to_table.transform(ppt_example_df)

## Display result DataFrame

In [20]:
# Print the result as a text table; long cell values are truncated in the output.
result.show()

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+-------------------+--------+--------------------+---------+
|                path|   modificationTime|  length|              tables|exception|
+--------------------+-------------------+--------+--------------------+---------+
|file:/Users/chico...|2021-08-28 10:10:57|14923264|[[0, 0, 32.0, 54....|     null|
+--------------------+-------------------+--------+--------------------+---------+





## Display extracted text of cell from second row and first column

In [21]:
# Index 1 of `tables.chunks` is the second row of the table.
second_row = result["tables.chunks"][1]
# Take the `chunkText` values of that row and keep the first one (first column).
first_cell_text = second_row["chunkText"][0]
result.select(first_cell_text).show()

+-------------------------------------------+
|tables.chunks AS chunks#126[1].chunkText[0]|
+-------------------------------------------+
|                           GAS PHASE REA...|
+-------------------------------------------+



## Display extracted data in JSON format

In [22]:
import json
# Serialize the `tables` column: one JSON string per DataFrame row.
df_json = result.select("tables").toJSON()

# Collect the strings to the driver and pretty-print each with a 4-space indent.
pretty_rows = (json.dumps(json.loads(row), indent=4) for row in df_json.collect())
for pretty_row in pretty_rows:
    print(pretty_row)

{
    "tables": {
        "area": {
            "index": 0,
            "page": 0,
            "x": 32.0,
            "y": 54.0,
            "width": 656.0,
            "height": 470.5,
            "score": 0.0,
            "label": "0"
        },
        "chunks": [
            [
                {
                    "chunkText": "Reaction",
                    "x": 32.0,
                    "y": 85.5,
                    "width": 200.0,
                    "height": 31.125
                },
                {
                    "chunkText": "Rate",
                    "x": 232.0,
                    "y": 85.5,
                    "width": 99.125,
                    "height": 31.125
                },
                null,
                {
                    "chunkText": "Units",
                    "x": 331.125,
                    "y": 85.5,
                    "width": 116.875,
                    "height": 31.125
                },
                {
                    "chunkT