# Example of parsing FoundationOne report using Spark OCR

## Install spark-ocr python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# Fill these in before running the notebook; both are empty placeholders here.
secret = ""
license = ""
# The package version is whatever precedes the first "-" in the secret
# (presumably the secret looks like "<version>-<token>" -- confirm with your credentials).
version = secret.partition("-")[0]
# Directory handed to sparkocr's start() as `jar_path` in the initialization cell.
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
# Probe for Google Colab by trying to import google.colab; the probe's own
# stdout and stderr are discarded, only its exit status matters.
if python -c 'import google.colab' > /dev/null 2>&1; then
    printf '%s\n' "Run on Google Colab!" "Install Open JDK"
    # Install headless OpenJDK 8 (stdout discarded), then report the Java version.
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    java -version
fi

In [3]:
import sys
import os

# When running on Google Colab, point JAVA_HOME at the OpenJDK 8 location
# and put its bin directory at the front of PATH.
if 'google.colab' in sys.modules:
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["PATH"] = ":".join([os.environ["JAVA_HOME"] + "/bin", os.environ["PATH"]])

In [None]:
# install from PYPI using secret
%pip install spark-ocr==$version.spark24 --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
# or install from local path
#%pip install --user ../../python/dist/spark-ocr-1.9.0.spark24.tar.gz

## Initialization of spark session

In [4]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the JSL_OCR_LICENSE environment variable,
# but only when a non-empty license was set in the configuration cell.
if license:
    os.environ.update(JSL_OCR_LICENSE=license)

# Start the Spark session via sparkocr's start(), passing the secret and the jar path.
spark = start(
    jar_path=spark_ocr_jar_path,
    secret=secret,
)
# Leave the session as the last expression so the notebook displays it.
spark

Spark version: 2.4.4
Spark NLP version: 2.5.5
Spark OCR version: 1.8.0



## Import OCR transformers

In [5]:
from sparkocr.transformers import *
from sparkocr.enums import *
from pyspark.ml import PipelineModel

## Define OCR transformers and pipeline

In [12]:
def pipeline():
    """Build the two-stage model that parses a FoundationOne PDF report.

    Stages, in order:

    1. ``PdfToText`` writes its output to the ``text`` column. It is configured
       with page splitting off, sorting on, and the
       ``TextStripperType.PDF_LAYOUT_TEXT_STRIPPER`` text stripper.
    2. ``FoundationOneReportParser`` reads the ``text`` column and writes its
       result to the ``genomics`` column.

    Returns:
        PipelineModel: a model wrapping the two configured stages.
    """
    # Stage 1: text extraction from the PDF (this pipeline has no image stage).
    text_extractor = PdfToText()
    text_extractor.setOutputCol("text")
    text_extractor.setSplitPage(False)
    text_extractor.setSort(True)
    text_extractor.setTextStripper(TextStripperType.PDF_LAYOUT_TEXT_STRIPPER)

    # Stage 2: parse the extracted report text into the "genomics" column.
    report_parser = FoundationOneReportParser()
    report_parser.setInputCol("text")
    report_parser.setOutputCol("genomics")

    return PipelineModel(stages=[text_extractor, report_parser])

## Read PDF document as binary file

In [7]:
import pkg_resources
# Resolve the path of the sample report shipped in the sparkocr package resources.
pdf_example = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/genomics/3.pdf',
)
# Load it through Spark's "binaryFile" source and mark the DataFrame for caching.
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf_example)
    .cache()
)

## Run OCR pipelines

In [13]:
# Build the pipeline, apply it to the PDF DataFrame and mark the output for caching.
result = (
    pipeline()
    .transform(pdf_example_df)
    .cache()
)

## Display results

In [14]:
# Print the "genomics" value of the first collected row. print() is used so the
# multi-line value shown below is rendered as-is rather than as a quoted repr.
print(
    result
    .select("genomics")
    .collect()[0]["genomics"]
)

{
  "Biomarker_findings" : [ {
    "name" : "Tumor Mutation Burden",
    "state" : "TMB-Low (5Muts/Mb)",
    "actionability" : "No therapies or clinical trials. "
  }, {
    "name" : "Microsatellite status",
    "state" : "MS-Stable",
    "actionability" : "No therapies or clinical trials. "
  } ],
  "Genomic_findings" : [ {
    "name" : "MET",
    "state" : "T263M",
    "therapies_with_clinical_benefit_in_patient_tumor_type" : [ "Crizotinib" ],
    "therapies_with_clinical_benefit_in_other_tumor_type" : [ "Cabozantinib" ]
  }, {
    "name" : "ERBB3",
    "state" : "P1212S",
    "therapies_with_clinical_benefit_in_patient_tumor_type" : [ "Afatinib" ],
    "therapies_with_clinical_benefit_in_other_tumor_type" : [ "Trastuzumab-dkst", "Trastuzumab", "Pertuzumab", "Lapatinib", "Ado-trastuzumab emtansine" ]
  }, {
    "name" : "EGFR",
    "state" : "amplification, exon 19 deletion",
    "therapies_with_clinical_benefit_in_patient_tumor_type" : [ "Osimertinib", "Gefitinib", "Erlotinib", "Afa

## Clear cache

In [10]:
# Release every DataFrame this notebook cached: the binary PDF input
# (cached when it was read) as well as the pipeline output.
pdf_example_df.unpersist()
# Kept as the last expression so the cell still displays the unpersisted result.
result.unpersist()

DataFrame[path: string, modificationTime: timestamp, length: bigint, image: struct<origin:string,height:int,width:int,nChannels:int,mode:int,resolution:int,data:binary>, pagenum: int, confidence: double, positions: array<struct<mapping:array<struct<c:string,p:int,x:float,y:float,width:float,height:float,fontSize:int>>>>, exception: string, text: string]