In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Summarize contracts (PDF files) using Spark and Gemini

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/summarization/pdf_contracts_summarization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/summarization/pdf_contracts_summarization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/generative_ai/summarization/pdf_contracts_summarization.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/summarization/pdf_contracts_summarization.ipynb">
      <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35">
      Open in BQ Studio
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks%2Fgenerative_ai%2Fsummarization%2Fpdf_contracts_summarization.ipynb">
    <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo">
    Open in Colab Enterprise
    </a>
  </td>

</table>

## Overview

This notebook shows how to use Gemini to summarize a large number of contract PDF files stored in a GCS bucket.

#### **Steps**
Using Spark, the notebook:
1) Reads the PDF files of the [Contract Understanding Atticus Dataset (CUAD)](https://www.atticusprojectai.org/cuad) dataset located in [gs://dataproc-metastore-public-binaries/cuad_v1/full_contract_pdf/](https://console.cloud.google.com/storage/browser/dataproc-metastore-public-binaries/cuad_v1).  
   This creates a metadata table pointing to the paths of the PDF files in the bucket.  
2) Calls the [Vertex AI Gemini API](https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/api-quickstart#try_text_prompts) to summarize each PDF.
3) Saves the output to BigQuery.

#### Related content

- [Design summarization prompts](https://cloud.google.com/vertex-ai/docs/generative-ai/text/summarization-prompts)

## Setup

#### Identity and Access Management (IAM)

Make sure the service account running this notebook has the required permissions:

- **Run the notebook**
  - AI Platform Notebooks Service Agent
  - Notebooks Admin
  - Vertex AI Administrator
- **Read files from bucket**
  - Storage Object Viewer
- **Run Dataproc jobs**
  - Dataproc Service Agent
  - Dataproc Worker
- **Call Google APIs (Gemini)**
  - Service Usage Consumer
  - VisionAI Admin
- **BigQuery**
  - BigQuery Data Editor

#### Imports

In [None]:
from pyspark.sql.functions import udf

import google.auth
import google.auth.transport.requests
import requests

In [None]:
# Install/upgrade the Vertex AI and Gen AI SDKs, with protobuf and numpy constrained to compatible releases.
# When using Dataproc Serverless, installed packages are automatically available on all nodes
!pip3 install --upgrade -q google-cloud-aiplatform google-genai "protobuf~=4.25.3" "numpy~=1.26.4" 
# When using a Dataproc cluster, you will need to install these packages during cluster creation: https://cloud.google.com/dataproc/docs/tutorials/python-configuration

#### Authentication

In [None]:
# Get credentials to authenticate with Google APIs.
# google.auth.default() also returns the project ID; `project_id` is read later
# by gemini_predict and when building the BigQuery destination table name.
credentials, project_id = google.auth.default()
# Refresh the credentials using a transport request object.
auth_req = google.auth.transport.requests.Request()
credentials.refresh(auth_req)

#### Setup Spark Session

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("PDF files summarization using Gemini")
    .enableHiveSupport()
    .getOrCreate()
)

#### Parameters

In [None]:
# Maximum number of PDF files to read and summarize (applied with .limit() when the dataset is loaded).
# Increase it to process more contracts.
limit_files = 5
# BigQuery destination: the summaries are written to <project>.<output_dataset_bq>.<output_table_bq>.
output_dataset_bq = "output_dataset" # the BigQuery dataset must be created beforehand
output_table_bq = "summaries"

## Read dataset

#### Read CUAD V1 PDF files from the public GCS bucket

In [None]:
# Public bucket prefix that holds the CUAD v1 contract PDFs.
BINARIES_BUCKET_PATH = "gs://dataproc-metastore-public-binaries/cuad_v1/full_contract_pdf/"

# Load the PDFs with the binaryFile source (columns: path, modificationTime,
# length, content), searching sub-folders recursively and keeping at most
# `limit_files` rows.
cuad_v1_df = (
    spark.read
    .format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(BINARIES_BUCKET_PATH)
    .limit(limit_files)
)

|                path|    modificationTime| length|             content|
|--------------------|--------------------|-------|--------------------|
|gs://dataproc-met...|2023-05-15 20:53:...|3683550|[25 50 44 46 2D 3...|
|gs://dataproc-met...|2023-05-15 20:53:...|2881262|[25 50 44 46 2D 3...|
|gs://dataproc-met...|2023-05-15 20:54:...|1778356|[25 50 44 46 2D 3...|
|gs://dataproc-met...|2023-05-15 20:53:...|1557129|[25 50 44 46 2D 3...|
|gs://dataproc-met...|2023-05-15 20:53:...|1452180|[25 50 44 46 2D 3...|

## Summarize PDF files using Gemini API

In [None]:
def gemini_predict(gcs_pdf_uri, model_name="gemini-2.0-flash", max_retries=3, initial_delay=1):
    """Summarize one PDF stored in Cloud Storage with Gemini on Vertex AI.

    The PDF is passed to the model by URI (it is not downloaded here) together
    with a fixed summarization prompt. Failed calls are retried with
    exponential backoff, and every failure is logged so that a missing summary
    can be diagnosed instead of disappearing silently.

    The third-party imports are kept local to the function, presumably so they
    are resolved wherever the function runs when wrapped as a Spark UDF.

    Args:
        gcs_pdf_uri: ``gs://`` URI of the PDF to summarize.
        model_name: Gemini model identifier passed to ``generate_content``.
        max_retries: Number of additional attempts after the first failure.
            Negative values are treated as 0, so one attempt is always made.
        initial_delay: Seconds to wait before the first retry; the wait
            doubles after each failed attempt.

    Returns:
        The text of the model response, or ``None`` when ``gcs_pdf_uri`` is
        empty/``None`` or when every attempt failed.
    """
    import logging
    import time
    from google import genai
    from google.genai import types

    # Nothing to summarize for a null/empty path: skip the API call entirely.
    if not gcs_pdf_uri:
        return None

    # `project_id` is the module-level value obtained from google.auth.default().
    client = genai.Client(
        vertexai=True,
        project=project_id,
        location="us-central1"
    )

    generate_content_config = types.GenerateContentConfig(
        response_mime_type="text/plain"
    )

    contents = [
        types.Part.from_uri(
            file_uri=gcs_pdf_uri,
            mime_type='application/pdf',
        ),
        """ You an expert in reading contracts, articles, agreements, or text in general.
            You are able to create concise summaries of the text provided to you.
            Provide a summary about the attached pdf with about 3 sentences with the most important information from the text.
            Summary:
        """
    ]

    total_attempts = max(max_retries, 0) + 1
    delay = initial_delay
    for attempt in range(1, total_attempts + 1):
        try:
            response = client.models.generate_content(
                model=model_name,
                contents=contents,
                config=generate_content_config,
            )
            return response.text
        except Exception as exc:
            # Best-effort: keep the Spark job alive, but leave a trace of the failure.
            logging.warning(
                "Gemini summarization failed for %s (attempt %d/%d): %s",
                gcs_pdf_uri, attempt, total_attempts, exc,
            )
            # Back off only when another attempt is coming.
            if attempt < total_attempts:
                time.sleep(delay)
                delay *= 2

    # Every attempt failed: a single, consistent "no summary" value.
    return None

In [None]:
# Wrap gemini_predict as a Spark UDF that yields a string column.
# Each call hits the Gemini API, which is billed per request and does not return
# the same text every time, so the UDF is flagged as non-deterministic to keep
# the optimizer from duplicating or eliminating invocations.
summarize_text = udf(gemini_predict, returnType="string").asNondeterministic()

In [None]:
# Add a "summary" column by calling Gemini on each PDF path.
# The frame is lazy: without caching, every action (show, BigQuery write) would
# re-run the UDF, paying for the Gemini calls again and possibly saving
# summaries that differ from the ones displayed. cache() lets later actions
# reuse the rows computed first.
summaries_df = cuad_v1_df.withColumn("summary", summarize_text(cuad_v1_df["path"])).cache()

In [None]:
# Preview up to 5 rows, truncating each cell to 50 characters.
summaries_df.show(n=5, truncate=50)

|                                              path|       modificationTime| length|                                           content|                                           summary|
|--------------------------------------------------|-----------------------|-------|--------------------------------------------------|--------------------------------------------------|
|gs://dataproc-metastore-public-binaries/cuad_v1...|2023-05-15 20:53:55.891|3683550|[25 50 44 46 2D 31 2E 34 0A 25 E2 E3 CF D3 0A 3...|Here is a summary of the provided document:\n\n...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|2023-05-15 20:53:57.195|2881262|[25 50 44 46 2D 31 2E 35 0A 25 E2 E3 CF D3 0A 0...|This document is a promotion and distribution a...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|2023-05-15 20:54:00.609|1778356|[25 50 44 46 2D 31 2E 35 0A 25 E2 E3 CF D3 0A 0...|This document is a strategic alliance agreement...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|2023-05-15 20:53:57.902|1557129|[25 50 44 46 2D 31 2E 35 0A 25 E2 E3 CF D3 0A 0...|This PDF is a collaboration agreement between t...|
|gs://dataproc-metastore-public-binaries/cuad_v1...|2023-05-15 20:53:57.659|1452180|[25 50 44 46 2D 31 2E 34 0D 25 C8 C8 C8 C8 C8 C...|This is a Transportation Services Agreement bet...|

## Save to BigQuery

In [None]:
# Persist the summaries to <project>.<dataset>.<table> through the BigQuery
# connector's direct write method, replacing any existing table contents.
(
    summaries_df.write
    .format("bigquery")
    .option("table", f"{project_id}.{output_dataset_bq}.{output_table_bq}")
    .option("writeMethod", "direct")
    .mode("overwrite")
    .save()
)