# Example of parsing Genomic Findings using Spark OCR

## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or a `secret`.

In [2]:
# Settings consumed by the install and session-start cells below.
# Fill these in before running the notebook.
secret = ""
license = ""

# Directory that is passed to start() as the jar path.
spark_ocr_jar_path = "../../target/scala-2.11"

# The text before the first "-" in the secret is used as the spark-ocr
# version to install.
version = secret.partition("-")[0]

In [None]:
%%bash
# Install a headless OpenJDK 8 only when the google.colab module can be
# imported; on any other host this cell does nothing.
if python -c 'import google.colab' > /dev/null 2>&1; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    # Keep apt quiet: -qq trims its messages and stdout is discarded.
    apt-get install -y -qq openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [3]:
import sys
import os

# On Google Colab, point JAVA_HOME at the OpenJDK 8 install location and put
# its bin directory at the front of PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(
        os.environ["JAVA_HOME"], os.environ["PATH"]
    )

In [None]:
# Install spark-ocr, pinned to `version`, using the package index at
# pypi.johnsnowlabs.com as an extra index. IPython substitutes $version and
# $secret from the notebook variables; the secret becomes part of the index URL.
%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [4]:
# Alternatively, install from a locally built distribution (uncomment the line below)
# %pip install --user ../../python/dist/spark-ocr-1.3.0rc1.tar.gz

Processing /Users/nmelnik/IdeaProjects/spark-ocr/python/dist/spark-ocr-1.3.0rc1.tar.gz
Building wheels for collected packages: spark-ocr
  Building wheel for spark-ocr (setup.py) ... [?25l- \ | done
[?25h  Created wheel for spark-ocr: filename=spark_ocr-1.3.0rc1-cp37-none-any.whl size=5015774 sha256=3deefe3e483c62b79cfaf28b1bbce792b78f06a84e523dd20263326d4ead8835
  Stored in directory: /Users/nmelnik/Library/Caches/pip/wheels/79/ee/b7/b1d6d10a6be137d65bd31f7d0159dcc1d704587c685a48fb4e
Successfully built spark-ocr
Installing collected packages: spark-ocr
Successfully installed spark-ocr-1.3.0rc1
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


## Initialization of spark session

In [6]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the JSL_OCR_LICENSE environment variable, but
# only when one was provided in the configuration cell.
if license:
    os.environ.update({'JSL_OCR_LICENSE': license})

# Start the session with the configured jar path and secret; the trailing
# bare expression displays the session object as the cell output.
spark = start(jar_path=spark_ocr_jar_path, secret=secret)
spark

SparkConf Configured, Starting to listen on port: 50179
JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar


## Import OCR transformers

In [7]:
from sparkocr.transformers import *
from sparkocr.enums import *
from pyspark.ml import PipelineModel

## Define OCR transformers and pipeline

In [8]:
def pipeline():
    """Build the two-stage pipeline that parses genomic findings from a PDF.

    Stages, in order:
      1. ``PdfToText`` writes its output to the ``text`` column, with
         ``splitPage`` set to ``False`` and the ``PDF_LAYOUT_TEXT_STRIPPER``
         text stripper selected.
      2. ``GenomicParser`` reads the ``text`` column and writes its output to
         the ``genomics`` column.

    Returns:
        PipelineModel: a model wrapping the two configured stages.
    """
    # Stage 1: PDF -> text (not PDF -> images).
    text_extractor = PdfToText()
    text_extractor.setOutputCol("text")
    text_extractor.setSplitPage(False)
    text_extractor.setTextStripper(TextStripperType.PDF_LAYOUT_TEXT_STRIPPER)

    # Stage 2: text -> genomic findings.
    findings_parser = GenomicParser()
    findings_parser.setInputCol("text")
    findings_parser.setOutputCol("genomics")

    return PipelineModel(stages=[text_extractor, findings_parser])

## Read PDF document as binary file

In [12]:
import pkg_resources
# Resolve the filesystem path of the sample PDF shipped inside the
# sparkocr package resources.
pdf_example = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/genomics/3.pdf',
)

# Load the PDF with Spark's "binaryFile" source and mark the DataFrame as cached.
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf_example)
    .cache()
)

## Run OCR pipelines

In [13]:
# Build the pipeline, apply it to the loaded PDF and mark the result as
# cached so later cells reuse it.
result = (
    pipeline()
    .transform(pdf_example_df)
    .cache()
)

## Display results

In [16]:
# Collect the "genomics" column to the driver and print the value from the
# first row.
print(
    result
    .select("genomics")
    .collect()[0]["genomics"]
)

{
  "Genomic_findings" : [ {
    "name" : "MET",
    "state" : "T263M",
    "therapies_with_clinical_benefit_in_patient_tumor_type" : [ "Crizotinib" ],
    "therapies_with_clinical_benefit_in_other_tumor_type" : [ "Cabozantinib" ]
  }, {
    "name" : "ERBB3",
    "state" : "P1212S",
    "therapies_with_clinical_benefit_in_patient_tumor_type" : [ "Afatinib" ],
    "therapies_with_clinical_benefit_in_other_tumor_type" : [ "Trastuzumab-dkst", "Trastuzumab", "Pertuzumab", "Lapatinib", "Ado-trastuzumab emtansine" ]
  }, {
    "name" : "EGFR",
    "state" : "amplification, exon 19 deletion",
    "therapies_with_clinical_benefit_in_patient_tumor_type" : [ "Osimertinib", "Gefitinib", "Erlotinib", "Afatinib" ],
    "therapies_with_clinical_benefit_in_other_tumor_type" : [ "Panitumumab", "Lapatinib", "Cetuximab" ]
  } ]
}


## Clear cache

In [10]:
# Release every DataFrame this notebook cached: the binary-file input
# (cached when the PDF was loaded) as well as the pipeline output.
# result.unpersist() stays last so the cell's displayed value is unchanged.
pdf_example_df.unpersist()
result.unpersist()

DataFrame[path: string, modificationTime: timestamp, length: bigint, image: struct<origin:string,height:int,width:int,nChannels:int,mode:int,resolution:int,data:binary>, pagenum: int, confidence: double, positions: array<struct<mapping:array<struct<c:string,p:int,x:float,y:float,width:float,height:float,fontSize:int>>>>, exception: string, text: string]