In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Summarize contracts (PDF files) using OCR (Vision API) and LLM (Gemini)

<table align="left">
<td>
<a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/summarization/ocr_contract_summarization_llm.ipynb">
<img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
View on GitHub
</a>
</td>
<td>
<a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/generative_ai/summarization/ocr_contract_summarization_llm.ipynb">
<img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
</a>
</td>
</table>

## Overview

This notebook shows how to perform OCR and summarization using an LLM for a large number of contract PDF files stored in a GCS bucket.

#### **Steps**
Using Spark, 
1) It reads the files of the [Contract Understanding Atticus Dataset (CUAD)](https://www.atticusprojectai.org/cuad) dataset located in [gs://dataproc-metastore-public-binaries/cuad_v1/full_contract_pdf/](https://console.cloud.google.com/storage/browser/dataproc-metastore-public-binaries/cuad_v1)  
   We will create a metadata table pointing to the paths of the PDF files in the bucket.  
2) It runs OCR using the Vision API - it starts a series of async operations and then checks their completion status.
3) It calls [Vertex AI Gemini API](https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/api-quickstart#try_text_prompts) to summarize the text.
4) It saves the output to BigQuery

#### Related content

- [Design summarization prompts](https://cloud.google.com/vertex-ai/docs/generative-ai/text/summarization-prompts)

## Setup

#### Identity and Access Management (IAM)

Make sure the service account running this notebook has the required permissions:

- **Run the notebook**
  - AI Platform Notebooks Service Agent
  - Notebooks Admin
  - Vertex AI Administrator
- **Read files from bucket**
  - Storage Object Viewer
- **Run Dataproc jobs**
  - Dataproc Service Agent
  - Dataproc Worker
- **Call Google APIs (Gemini and Vision)**
  - Service Usage Consumer
  - VisionAI Admin
- **BigQuery**
  - BigQuery Data Editor

#### Imports

In [None]:
import os
import sys
import re
import json
import time

from pyspark.sql.functions import udf, col, lit, split, explode, size, avg, count, regexp_replace, collect_list
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

import google.auth
import google.auth.transport.requests
import requests

from google.cloud import storage

In [None]:
# Install/upgrade the Vertex AI and Vision client libraries used by the cells below.
# NOTE(review): versions are not pinned, so the installed versions depend on when this cell is run.
# When using Dataproc Serverless, installed packages are automatically available on all nodes
!pip install --upgrade google-cloud-aiplatform google-cloud-vision
# When using a Dataproc cluster, you will need to install these packages during cluster creation: https://cloud.google.com/dataproc/docs/tutorials/python-configuration

#### Authentication

In [None]:
# Get credentials to authenticate with Google APIs
credentials, project_id = google.auth.default()
auth_req = google.auth.transport.requests.Request()
credentials.refresh(auth_req)

#### Setup Spark Session

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) the Spark session for this notebook, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("OCR contract PDF files and summarize")
    .enableHiveSupport()
    .getOrCreate()
)

#### Parameters

In [None]:
# Maximum number of PDF files to process (applied as a row limit when the dataset is read)
limit_files = 5
# OCR
# NOTE(review): this is the same bucket the input PDFs are read from; point it at a
# bucket the notebook's service account can write to before running.
gcs_output_bucket = "gs://dataproc-metastore-public-binaries" # Output bucket (gs:// URI) where OCR text files will be saved
output_path_prefix = "cuad_v1/output_ocr" # path prefix after bucket name where the folder structure will be created
# BigQuery
output_dataset_bq = "output_dataset" # create the BigQuery dataset beforehand
output_table_bq = "ocr_summaries" # table name inside the dataset above
bq_temp_bucket_name = "workspaces-bq-temp-bucket-dev" # passed as `temporaryGcsBucket` to the BigQuery writer

## Read dataset

#### Read CUAD V1 dataset from the GCS bucket

In [None]:
# GCS folder that holds the CUAD v1 contract PDFs.
BINARIES_BUCKET_PATH = "gs://dataproc-metastore-public-binaries/cuad_v1/full_contract_pdf/"

# Load the PDFs as binary files (looking into sub-folders too), keeping at most `limit_files` rows.
cuad_v1_df = (
    spark.read
    .format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(BINARIES_BUCKET_PATH)
    .limit(limit_files)
)

|                path|    modificationTime| length|             content|
|--------------------|--------------------|-------|--------------------|
|gs://dataproc-met...|2023-05-15 20:53:...|3683550|[25 50 44 46 2D 3...|
|gs://dataproc-met...|2023-05-15 20:53:...|2881262|[25 50 44 46 2D 3...|
|gs://dataproc-met...|2023-05-15 20:54:...|1778356|[25 50 44 46 2D 3...|
|gs://dataproc-met...|2023-05-15 20:53:...|1557129|[25 50 44 46 2D 3...|
|gs://dataproc-met...|2023-05-15 20:53:...|1452180|[25 50 44 46 2D 3...|

In [None]:
# Keep only the file location, exposed under the column name `pdf_path`.
files_df = cuad_v1_df.select(col("path").alias("pdf_path"))

## Run OCR using Vision API

#### Run OCR - Start operations

In [None]:
from google.cloud import vision_v1 as vision

In [None]:
def perform_ocr(gcs_source_uri, gcs_output_bucket, output_path_prefix):
    """Submit an asynchronous Vision API text-detection request for one PDF in GCS.

    The destination URI is assembled from the output bucket, the output prefix
    and the source object's path with its leading ``gs://<bucket>`` removed, so
    the output location mirrors the folder layout of the input.

    Args:
        gcs_source_uri: ``gs://`` URI of the PDF to process.
        gcs_output_bucket: ``gs://`` URI of the bucket receiving the OCR output.
        output_path_prefix: Path placed between the bucket and the mirrored source path.

    Returns:
        A two-item list: the destination URI and the name of the submitted operation.
    """
    source_dir, pdf_name = os.path.split(gcs_source_uri)
    # Strip the first "gs://<bucket>" so only the folder path inside the bucket is left.
    relative_dir = re.sub(r"gs://[^/]+", "", source_dir, count=1)
    gcs_destination_uri = "/".join(
        [gcs_output_bucket, output_path_prefix + relative_dir, pdf_name]
    )

    # Describe what to detect, where to read the PDF and where to write the result.
    text_detection = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    pdf_input = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_source_uri),
        mime_type='application/pdf',
    )
    ocr_output = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=100,
    )
    async_request = vision.AsyncAnnotateFileRequest(
        features=[text_detection],
        input_config=pdf_input,
        output_config=ocr_output,
    )

    # Submit the request; only the operation name is returned, the result is not awaited here.
    annotator = vision.ImageAnnotatorClient()
    submitted = annotator.async_batch_annotate_files(requests=[async_request])

    return [gcs_destination_uri, submitted.operation.name]

In [None]:
# Return type of `perform_ocr`: the OCR output location and the name of the
# Vision API operation submitted for the file.
schema = StructType(
    [
        StructField("ocr_text_path", StringType(), False),
        StructField("vision_api_async_operation_name", StringType(), False)
    ]
)

# `perform_ocr` has a side effect: every call submits a new Vision API operation.
# Spark treats UDFs as deterministic by default and may therefore invoke them more
# often than they appear in the query (e.g. once per struct field extracted from
# the result). Marking the UDF as non-deterministic prevents that duplication.
perform_ocr_udf = udf(perform_ocr, schema).asNondeterministic()

In [None]:
# Add a struct column `ocr_async_op` holding the result of `perform_ocr_udf` for each PDF.
ocr_async_op_df = files_df.withColumn(
    "ocr_async_op",
    perform_ocr_udf(
        files_df["pdf_path"],
        lit(gcs_output_bucket),
        lit(output_path_prefix),
    ),
)

In [None]:
# Promote the two fields of the `ocr_async_op` struct to top-level columns, then drop the struct.
# NOTE: this cell rebinds `ocr_async_op_df` and removes the column it reads from, so
# re-running it on its own fails; re-run the cell that creates `ocr_async_op` first.
ocr_async_op_df = (
    ocr_async_op_df
    .withColumn(
        "ocr_text_path",
        ocr_async_op_df["ocr_async_op"]["ocr_text_path"],
    )
    .withColumn(
        "vision_api_async_operation_name",
        ocr_async_op_df["ocr_async_op"]["vision_api_async_operation_name"],
    )
    .drop("ocr_async_op")
)

In [None]:
# Cache BEFORE the first action. Displaying the rows evaluates the OCR UDF, which
# submits the Vision API operations; without a cache in place, the next action on
# this DataFrame would evaluate the UDF again and submit a second set of operations
# whose names differ from the ones displayed here.
ocr_async_op_df.cache().show(10, 50)

|                                          pdf_path|                                     ocr_text_path|                   vision_api_async_operation_name|
|--------------------------------------------------|--------------------------------------------------|--------------------------------------------------|
|gs://dataproc-metastore-public-binaries/cuad_v1...|gs://dataproc-metastore-public-binaries/cuad_v1...|projects/dataproc-workspaces-notebooks/operatio...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|gs://dataproc-metastore-public-binaries/cuad_v1...|projects/dataproc-workspaces-notebooks/operatio...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|gs://dataproc-metastore-public-binaries/cuad_v1...|projects/dataproc-workspaces-notebooks/operatio...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|gs://dataproc-metastore-public-binaries/cuad_v1...|projects/dataproc-workspaces-notebooks/operatio...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|gs://dataproc-metastore-public-binaries/cuad_v1...|projects/dataproc-workspaces-notebooks/operatio...|

In [None]:
# Mark the DataFrame as cached so that later cells can reuse the computed rows
# (operation names and output paths) instead of recomputing them.
ocr_async_op_df.cache()

#### Check status of OCR operations

In [None]:
def check_completion(operation_name):
    """Report the status of a Vision API long-running operation.

    Args:
        operation_name: Name of the operation, as returned when the request was submitted.

    Returns:
        'processing' while the operation has not finished, 'done' when it finished
        without an error, or an 'Operation error: ...' message carrying the error
        code and message when it finished with an error.
    """
    client = vision.ImageAnnotatorClient()

    operation = client.get_operation({'name': operation_name})

    if not operation.done:
        return 'processing'

    # A finished operation carries either a response or an error status;
    # a non-zero error code means the OCR request failed.
    error = getattr(operation, 'error', None)
    if error is not None and error.code:
        return f'Operation error: code {error.code} and message {error.message}'

    return 'done'

In [None]:
# Wrap the status checker as a Spark UDF that returns a string.
# NOTE: this rebinds the name `check_completion` from the plain function to the UDF,
# so the function definition cell must be re-run before re-running this cell.
check_completion = udf(check_completion, StringType())

In [None]:
# Pause before polling the status of the OCR operations.
# NOTE(review): 45 seconds is a fixed wait, not a completion guarantee; operations
# reported as 'processing' afterwards need the status check to be run again later.
time.sleep(45)

In [None]:
# Add a `status` column with the result of the status check for each operation.
check_completion_df = ocr_async_op_df.withColumn(
    "status",
    check_completion(ocr_async_op_df["vision_api_async_operation_name"]),
)

#### Get processed OCR text files from bucket

In [None]:
### Spark User Defined Function (UDF)
def read_completed_ocr(path):

    bucket = re.compile(r"gs://[^/]+").findall(path)[0]
    prefix = re.sub(r"gs://[^/]+", "", path, 1)[1:]

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket[5:])
    blobs = [blob for blob in list(bucket.list_blobs(prefix=prefix)) if not blob.name.endswith('/')]

    try:
        
        json_string = blobs[0].download_as_bytes().decode("utf-8")
        structured_ocr = json.loads(json_string)
        ocr_text = ""
        ocr_pages = []
        for page in structured_ocr['responses']:
            if('fullTextAnnotation' in page):
                fullTextAnnotation = page["fullTextAnnotation"]
                if('text' in fullTextAnnotation):
                    page_text = fullTextAnnotation['text']
                    ocr_text += page_text
                    ocr_pages.append(page_text)
        return ocr_text.strip(), ocr_pages
    
    except Exception as e:
        return "Error getting ocr from pdf: " + str(e)

In [None]:
# Struct returned by `read_completed_ocr`: the whole document text plus the per-page texts.
schema = StructType([
    StructField("ocr_text", StringType(), nullable=False),
    StructField("ocr_pages", ArrayType(StringType(), containsNull=False), nullable=False),
])

# NOTE: rebinds `read_completed_ocr` from the plain function to the Spark UDF.
read_completed_ocr = udf(read_completed_ocr, returnType=schema)

In [None]:
# Display the status reported for each OCR operation.
check_completion_df.show()

In [None]:
# Keep only the rows whose OCR operation reported the status 'done'.
completion_df = check_completion_df.filter(col("status") == "done")

In [None]:
# Mark the filtered (status == 'done') rows as cached.
completion_df.cache()

#### Get complete OCR text

In [None]:
# Read the OCR output only for operations that finished (`completion_df`).
# Using the unfiltered status DataFrame here would also try to read output for
# operations that are still processing or that failed.
fetch_ocr_df = completion_df.withColumn(
    "ocr_output",
    read_completed_ocr(completion_df['ocr_text_path']),
)

# Flatten the struct returned by the UDF and add the page count of each document.
ocr_df = (
    fetch_ocr_df
    .select("pdf_path", "ocr_output")
    .withColumn("ocr_text", fetch_ocr_df["ocr_output"]["ocr_text"])
    .withColumn("ocr_pages", fetch_ocr_df["ocr_output"]["ocr_pages"])
    .withColumn("number_pages", size(col("ocr_pages")))
    .drop("ocr_output")
)

In [None]:
# Preview up to 5 rows, truncating each cell to 50 characters.
ocr_df.show(n=5, truncate=50)

In [None]:
# Mark the OCR text DataFrame as cached; it is the input of the summarization step.
ocr_df.cache()


|  pdf_path|  ocr_text| ocr_pages|number_pages|
|----------|----------|----------|------------|
|gs://da...|THIS AG...|[THIS A...|           8|
|gs://da...|Exhibit...|[Exhibi...|          40|
|gs://da...|Exhibit...|[Exhibi...|          44|
|gs://da...|Exhibit...|[Exhibi...|         100|
|gs://da...|TRANSPO...|[TRANSP...|          25|

## Summarize pages using Gemini API

In [None]:
def gemini_predict(prompt, temperature=0.5, model_name="gemini-1.5-pro"):
    """Send a text prompt to a Vertex AI Gemini model and return the generated text.

    Args:
        prompt: Text sent to the model as a single user message.
        temperature: Sampling temperature passed in the generation config.
        model_name: Name of the Gemini model to instantiate.

    Returns:
        The text of the model response (``response.text``).
    """
    # Imported inside the function body, as in the original cell, so the name
    # lookup happens wherever the function is executed.
    from vertexai.generative_models import GenerativeModel, Part, Content, HarmCategory, HarmBlockThreshold

    model = GenerativeModel(model_name=model_name)

    prompt_content = Content(
        role="user",
        parts=[Part.from_text(prompt)]
    )

    response = model.generate_content(
        prompt_content,
        generation_config={
                "temperature": temperature,
                # Summaries are free-form text. "text/x.enum" is meant for
                # classification output constrained by a response schema, which
                # this call does not provide.
                "response_mime_type": "text/plain"
        },
        # NOTE(review): HARM_CATEGORY_UNSPECIFIED does not name a concrete harm
        # category; confirm whether the concrete categories were intended here.
        safety_settings={
                HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_ONLY_HIGH
        }
    )

    return response.text

In [None]:
def summarize_text(page):
    """Build the summarization prompt around `page` and return Gemini's answer.

    Args:
        page: Text to summarize; it is inserted between the instructions and
            the closing request for a summary of about 3 sentences.

    Returns:
        The value returned by `gemini_predict` for the assembled prompt.
    """
    # Instructions placed before the text to summarize.
    preamble = (
        "You an expert in reading contracts, articles, agreements, or text in general.\n"
        "You are able to create concise summaries of the text provided to you.\n"
        "Try your best to summarize the text even if the information is not so well understandable.\n"
        "Here is an article I will ask you to summarize:\n"
    )
    # Request placed after the text to summarize.
    closing = (
        "\nProvide a summary with about 3 sentences with the most important information from the text.\n"
        "Summary:\n"
    )

    return gemini_predict(f"{preamble}{page}{closing}")
    
# Spark UDF (string result) around `summarize_text`.
# NOTE(review): no later cell in this notebook references `generate_descriptions_udf`.
generate_descriptions_udf = udf(summarize_text, StringType())

In [None]:
# Wrap the summarizer as a Spark UDF that returns a string.
# NOTE: this rebinds `summarize_text` from the plain function to the UDF, so the
# function definition cell must be re-run before re-running this cell.
summarize_text = udf(summarize_text, StringType())

In [None]:
# Add a `summary` column computed from the full OCR text of each document.
summaries_df = ocr_df.withColumn(
    "summary",
    summarize_text(ocr_df["ocr_text"]),
)

In [None]:
# Preview up to 5 rows, truncating each cell to 50 characters.
summaries_df.show(n=5, truncate=50)

|                                          pdf_path|                                          ocr_text|                                         ocr_pages|number_pages|                                           summary|
|--------------------------------------------------|--------------------------------------------------|--------------------------------------------------|------------|--------------------------------------------------|
|gs://dataproc-metastore-public-binaries/cuad_v1...|EXECUTION COPY\nConfidential\nExhibit 10.18\nCE...|[EXECUTION COPY\nConfidential\nExhibit 10.18\nC...|          85|This Development and Option Agreement outlines ...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|Source: UPJOHN INC, 10-12G, 1/21/2020\nFORM OF\...|[Source: UPJOHN INC, 10-12G, 1/21/2020\nFORM OF...|          82|This Manufacturing and Supply Agreement outline...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|Exhibit 10.1\nCERTAIN CONFIDENTIAL PORTIONS OF ...|[Exhibit 10.1\nCERTAIN CONFIDENTIAL PORTIONS OF...|          71|This Network Build and Maintenance Agreement ou...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|Exhibit 10.2\nCERTAIN INFORMATION (INDICATED BY...|[Exhibit 10.2\nCERTAIN INFORMATION (INDICATED B...|          68|This is a Distributorship Agreement between Zog...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|Exhibit 10.12\n[***] Certain information in thi...|[Exhibit 10.12\n[***] Certain information in th...|          85|This Collaboration Agreement outlines the terms...|

Example: 

|page|
|----------|
|EXECUTION COPY\nConfidential\nExhibit 10.18\nCERTAIN CONFIDENTIAL INFORMATION CONTAINED IN THIS DOCUMENT, MARKED BY \*\*\*, HAS BEEN OMITTED BECAUSE IT IS BOTH NOT MATERIAL AND WOULD BE COMPETITIVELY\nHARMFUL IF PUBLICLY DISCLOSED.\nDEVELOPMENT AND OPTION AGREEMENT\nbetween\nHARPOON THERAPEUTICS, INC.\nand\nABBVIE BIOTECHNOLOGY LTD\nDated as of November 20, 2019\nSource: HARPOON THERAPEUTICS, INC., 10-K, 3/12/2020TABLE OF CONTENTS\nARTICLE 1\nDEFINITIONS\n1\nARTICLE 2\n18\nCOLLABORATION\nMANAGEMENT\n2.1\nJoint Governance Committee.\n2.2\n2.3\nDiscontinuation of the JGC.\n2.4\nGeneral Provisions Applicable to the JGC.\nInteractions Between the JGC and Internal Teams.\n18\n19\n20\n2.5\nCMC Working Group.\n2.6\nWorking Groups.\n2.7\nExpenses.\n21\n21\n21\nARTICLE 3\n21\nDEVELOPMENT\nAND\nREGULATORY\n3.1\n3.2\n3.3\n\*\*\*.\n3.4\n3.5\nInitial Development Plan and Activities.\nAbbVie Option.\n24\nPost-Exercise Development Activities.\nSupply of Technology for Development Purposes.\n21\n25\n3.6\n3.7\n3.8\nARTICLE 4\nExpenses and Invoicing.\nSubcontracting.\nRegulatory Matters.\n30\n26\n27\n28\n28\nCOMMERCIALIZATION\n4.1\n4.2\n4.3\n4.4\n4.5\nProducts.\nARTICLE 5\n33\nGRANT OF\nRIGHTS\nIn General.\nCommercialization Diligence.\nBooking of Sales; Distribution.\n30\n30\n31\n31\nProduct Trademarks.\nCommercial Supply of Licensed Compounds or Licensed\n31\n20\n27\n27\n5.1\nGrants to AbbVie.\n5.2\nGrants to Harpoon.\n5.3\nSublicenses.\n5.4\nDistributorships.\n5.5\nCo-Promotion Rights.\n5.6\nRetention of Rights.\n5.7\n5.8\n5.9\nConfirmatory Patent License.\nExclusivity with Respect to the Territory.\nIn-License Agreements.\n33\n34\n34\n34\n34\n34\n35\n35\n35\nARTICLE 6\n36\nPAYMENTS AND\nRECORDS\n6.1\nUpfront Payment.\n36\n6.2\n6.3\nDevelopment and Regulatory Milestones.\nFirst Commercial Sales Milestones.\n36\n37\n6.4\nSales-Based Milestones.\n37\n6.5\nRoyalties.\n38\n6.6\nRoyalty Payments and Reports.\n39\n6.7\nMode of Payment; Offsets.\n40\n6.8\nWithholding 
Taxes.\n40\nSource: HARPOON THERAPEUTICS, INC., 10-K, 3/12/202040\n41\n6.9\nIndirect Taxes.\n6.10\nInterest on Late Payments.\n6.11\nAudit.\n41\n6.12\nAudit Dispute.\n6.13\nConfidentiality.\n41\n41\n6.14\n\*\*\*\n41\n6.15\nNo Other Compensation.\nARTICLE 7\n42\nINTELLECTUAL\nPROPERTY\n42\n7.1\nOwnership of Intellectual Property.\n7.2\n7.3\n7.4\n7.5\n7.6\n7.7\n7.8\n7.9\nARTICLE 8\nMaintenance and Prosecution of Patents.\n42\n43\nEnforcement of Patents.\n45\nInfringement Claims by Third Parties.\n48\nInvalidity or unenforceability Defenses or Actions.\n48\nProduct Trademarks.\n49\nInternational Nonproprietary Name.\n50\nInventor's Remuneration.\n50\nCommon Interest.\n50\n50\nPHARMACOVIGILANCE\nAND SAFETY\n8.1\n8.2\nPharmacovigilance.\nGlobal Safety Database.\n50\n50\n50\nARTICLE 9\n51\nCONFIDENTIALITY\nAND NON-\nDISCLOSURE\n9.1\n9.2\n9.3\nProduct Information.\nConfidentiality Obligations.\nPermitted Disclosures.\n51\n51\n52\n2\n9.4\nUse of Name.\n53\n553\n9.5\nPublic Announcements.\n9.6\nPublications.\n53\n54\n9.7\n9.8\nReturn of Confidential Information.\nSurvival.\n54\n54\nARTICLE 10\n55\nREPRESENTATIONS\nAND WARRANTIES\n10.1\n10.2\n10.3\n10.4\n10.5\nMutual Representations and Warranties.\n55\nAdditional Representations and Warranties of Harpoon.\nCovenants of Harpoon.\n58\nCovenants of AbbVie.\n58\nDISCLAIMER OF WARRANTIES.\n59\nARTICLE 11\n60\nINDEMNITY\n11.1\n11.2\nIndemnification of Harpoon.\nIndemnification of AbbVie.\n11.3\n11.4\n11.5\n11.6\n60\n66\n60\nNotice of Claim.\n60\nControl of Defense.\n61\nSpecial, Indirect, and Other Losses.\n61\nInsurance.\n61\nARTICLE 12\n62\nTERM AND\nTERMINATION\n12.1\n- ii -\nTerm.\n62\nSource: HARPOON THERAPEUTICS, INC., 10-K, 3/12/2020\n55\n5512.2\n12.3\nTermination for Material Breach.\nAdditional Termination Rights by AbbVie.\n12.4\nTermination for Insolvency.\n12.5\nRights in Bankruptcy.\n12.6\nTermination in Entirety.\n12.7\nReversion of Harpoon Products.\n12.8\n12.9\n12.10\nTermination of Terminated 
Territory.\nRemedies.\nAccrued Rights; Surviving Obligations.\n67\n62\n63\n63\n63\n66\n67\n67\n12\n63\n63\nARTICLE 13\n68\nMISCELLANEOUS\n13.1\nForce Majeure.\n68\n13.2\nChange in Control of Harpoon.\n68\n13.3\nExport Control.\n69\n13.4\nAssignment.\n69\n13.5\nSeverability.\n70\n13.6\nGoverning Law, Jurisdiction and Service.\n70\n13.7\nDispute Resolution.\n70\n13.8\nNotices.\n71\n13.9\nEntire Agreement; Amendments.\n72\n13.10\nEnglish Language.\n72\n13.11\nEquitable Relief.\n72\n13.12\nWaiver and Non-Exclusion of Remedies.\n72\n13.13\nNo Benefit to Third Parties.\n72\n13.14\nFurther Assurance.\n73\n13.15\nRelationship of the Parties.\n13.16\nPerformance by Affiliates.\n73\nWW\n73\n13.17\nCounterparts; Facsimile Execution.\n73\n13.18\nReferences.\n73\n13.19\nSchedules.\n73\n13.20\nSCHEDULES\nSchedule 1.84\nSchedule 1.99\nSchedule 3.7\nSchedule 10.2\nSchedule 10.2.1\nSchedule 13.7.3\nConstruction.\nInitial Development Plan\nLicensed Compound\nPre-Approved Third Party Providers\nDisclosure Schedules\nExisting Patents\nArbitration\n73\n- 111 -\nSource: HARPOON THERAPEUTICS, INC., 10-K, 3/12/2020DEVELOPMENT AND OPTION AGREEMENT\nThis Development and Option Agreement (the "Agreement") is made and entered into effective as of\nNovember 20, 2019 (the "Effective Date") by and between Harpoon Therapeutics, Inc., a Delaware corporation ("Harpoon”), and\nAbbVie Biotechnology Ltd, a Bermuda corporation (“AbbVie”). 
Harpoon and AbbVie are sometimes referred to herein individually\nas a "Party" and collectively as the "Parties."\nRECITALS\nWHEREAS, Harpoon Controls (as defined herein) certain intellectual property rights with respect to the\nLicensed Compound (as defined herein) and Licensed Products (as defined herein) in the Territory (as defined herein); and\nWHEREAS, Harpoon wishes to grant an option to a license to AbbVie, and AbbVie wishes to take, such option\nto a license under such intellectual property rights to develop and commercialize Licensed Products in the Territory, in each case in\naccordance with the terms and conditions set forth below.....................|

|summary|
|----------|
|This Development and Option Agreement outlines the collaboration between Harpoon Therapeutics, Inc. and AbbVie Biotechnology Ltd for the development and commercialization of a compound known as HPN217. The agreement grants AbbVie an exclusive option to license the compound after reviewing the results of a Phase I/IB trial conducted by Harpoon. Upon exercising the option, AbbVie will take over development and commercialization responsibilities, with Harpoon receiving milestone payments and royalties on net sales. |

## Save to BigQuery

In [None]:
# Collect the summaries of each PDF into a list column named `page_summary_list`.
# NOTE(review): the write cell that follows saves `summaries_df`, not this DataFrame.
agreggated_df = (
    summaries_df
    .groupby("pdf_path")
    .agg(collect_list("summary").alias("page_summary_list"))
)

In [None]:
# Save the summaries to `<project_id>:<dataset>.<table>` in BigQuery, overwriting the
# table and using `bq_temp_bucket_name` as the temporary GCS bucket for the write.
(
    summaries_df.write
    .format("bigquery")
    .option("table", project_id + ":" + output_dataset_bq + "." + output_table_bq)
    .option("temporaryGcsBucket", bq_temp_bucket_name)
    .option("enableListInference", True)
    .mode("overwrite")
    .save()
)