In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| | |
|-|-|
|Author(s) | [Zachary Thorman](https://github.com/zthor5)|

# Overview

This Notebook will generate JSONLs & Training splits for Finetuning from a list of PDF's using Generative AI's full context to do analysis of the PDFs.

# Getting Started

In this section, you will install needed dependencies & define the Google Cloud project where you want to connect to Vertex AI.

### Install dependencies

In [None]:
!pip install --upgrade --quiet google-generativeai chromadb pymupdf google-cloud-storage langchain==0.1.20

Then import the modules you'll use in this tutorial.

In [None]:
import textwrap
import chromadb
import numpy as np
import pandas as pd
import pymupdf
import re
import time

from vertexai.generative_models import GenerativeModel, Part
import vertexai.preview.generative_models as generative_models
from vertexai.language_models import TextEmbeddingModel

# Used to securely store your API key
from google.colab import userdata
from google.cloud import storage

from IPython.display import Markdown
from chromadb import Documents, EmbeddingFunction, Embeddings

# Import LangChain components
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader


# Initialize Vertex AI
import vertexai
import sys

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After its restarted, continue to the next step.

In [None]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ Wait for the kernel to finish restarting before you continue. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth
    auth.authenticate_user()

### Define Google Cloud project information, initialize Vertex AI, and add Secrets

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Utilizing Secrets to retrieve sensitive information
# You can add your own projectID and location to run in your environment.

PROJECT_ID = userdata.get('ProjectId') # @param {type:"string"}
LOCATION = "us-central1"    # @param {type:"string"}


vertexai.init(project=PROJECT_ID, location=LOCATION)

# Helper Functions for Creating Fine Tuning Data

## Generating JSONLs

Note: *Markdown is currently lost in this conversion.*

In [112]:
# Pauses execution on GCP for 12 second due to default Quota for Vertex AI
defualt_quota_sec = 15

# Create a text splitter to divide documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=200,
)

# Creating the Generation Config
generation_config = {
"max_output_tokens": 8192,
"temperature": 0,
"top_p": 0.95,
}

# Defining Safety filters that WILL NOT block (hopefully) the content outputted
safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
}


In [113]:
def download_pdfs_to_local(pdfList):
  local_pdfs = []
  for pdf in pdfList:
    ! gsutil -q cp {pdf} .
    print(f'Downloaded: {pdf}')
    local_pdfs.append(re.split(r'([^\/]+$)', pdf)[-2]) # *zthor* Potentially change to use a simple split then backwards trace the list.
  return local_pdfs


def pdf_context_jsonl(local_pdfList, model):
  all_pdfs_text = ""
  for pdf in local_pdfList:
    with pymupdf.open(pdf) as doc:
      for page in doc:
          all_pdfs_text += page.get_text()

  # Split the text into chunks
  chunks = text_splitter.split_text(all_pdfs_text)
  with open('context.jsonl', 'w') as f:
      for i, chunk in enumerate(chunks):
        chunk = chunk.replace("\n"," ")
        chunk = chunk.replace("\"","\'")
        try:
          title = model.generate_content(f"Generate a 10 word summary of this text: {chunk}")
          if i%10 == 0:
            print(f'Processing LLM calls for chunks ({i} - {i + 10})')
        except Exception as err:
          print(f"LLM's need breaks too! Paused execution for default quota: {defualt_quota_sec} seconds.")
          time.sleep(defualt_quota_sec)
          try:
            title = model.generate_content(f"Generate a 10 word summary of this text: {chunk}")
            print(f'Finished LLM call for chunk ({i})')
          except Exception as err:
            title = "null"
            raise Exception(f"Oh, maybe something else was the issue? Here is the error: {err}")
        cleansed_title = title.text.replace("\n","")
        f.write(f'{{"_id":"context_{i}","title":"{cleansed_title}","text":"{chunk}"}}\n')
  print(f'Finished generating Context.jsonl')
  return all_pdfs_text


def validate_jsonl_llm(jsonl_text, model):
  prompt_analysis = f"Return 'true' or 'false' based on if the following text is properly formatted and is valid JSONL that would not generate errors:\n{jsonl_text}"
  response = model.generate_content(prompt_analysis)
  print(f'validate_jsonl_llm model response: {response.text}')
  return True if ("true" in response.text.lower()) else False


# Creates JSONL Prompts for a PDF and writes them into a file
# *zthor* Modify Prompt to generate reliably at least 10 to 50 Per PDF
def pdf_query_jsonl(gcs_pdfList, model):
  validate_text = ""
  prompt = 'Output in JSONL up to 10 questions that can be answered based on the content of the pdf provided. Output only in JSONL format using this template: {"_id":"query_[An Iterable number]","text":"[A question based on the pdf provided]"}'
  with open('query.jsonl', 'w') as f:
    for pdf in gcs_pdfList:
      pdf_file = Part.from_uri(pdf, mime_type="application/pdf")
      output = model.generate_content([prompt,pdf_file])
      f.write(output.text)
      validate_text += output.text

  is_validated = validate_jsonl_llm(validate_text,model)
  if (is_validated):
    print(f"Validate_jsonl_llm returned: {is_validated}")
    f.close()
    return "Successful Creation of Prompt.jsonl"
  else:
    if 'yes' in input('Failed Creation of Prompt.jsonl; Reattempt creation? (yes or no): ').lower():
      f.close()
      pdf_query_jsonl(gcs_pdfList, model)
    else:
      f.close()
      return "Failed Creation of Prompt.jsonl"

def create_pairing_tsv(pdfList, pdf_text, model):
  all_pairs = [] # TSV of: [query-id], [context-id], [score]
  with open('query.jsonl', 'r') as f:
   for line in f:
    print(f'Processing LLM calls for line: {line}')



#f.read()




For More details on what is needed for Fine Tuning, [learn more here!](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#dataset-format)

In [114]:
model = GenerativeModel("gemini-1.5-pro-preview-0514", generation_config = generation_config, safety_settings=safety_settings)

gcs_pdfList =  ["gs://dmv-pdf-analysis/driver_manual_ga_2024.pdf",
            "gs://dmv-pdf-analysis/commercial_driver_guide.pdf",
            "gs://dmv-pdf-analysis/motorcycle_operator_guide.pdf",
            "gs://dmv-pdf-analysis/40_hour_teen_driving_guide.pdf",
            "gs://dmv-pdf-analysis/alcohol_drug_awareness_student.pdf",
] # Needs to be stored in a GCS Bucket

local_pdf_list = download_pdfs_to_local(gcs_pdfList)

all_pdf_text = pdf_context_jsonl(local_pdf_list, model)
print(pdf_query_jsonl(gcs_pdfList, model))
create_pairing_tsv(local_pdf_list, all_pdf_text, model)

Downloaded: gs://dmv-pdf-analysis/driver_manual_ga_2024.pdf
Downloaded: gs://dmv-pdf-analysis/commercial_driver_guide.pdf
Downloaded: gs://dmv-pdf-analysis/motorcycle_operator_guide.pdf
Downloaded: gs://dmv-pdf-analysis/40_hour_teen_driving_guide.pdf
Downloaded: gs://dmv-pdf-analysis/alcohol_drug_awareness_student.pdf
Processing LLM calls for chunks (0 - 10)
Processing LLM calls for chunks (10 - 20)
Processing LLM calls for chunks (20 - 30)
Processing LLM calls for chunks (30 - 40)
Processing LLM calls for chunks (40 - 50)
Processing LLM calls for chunks (50 - 60)
Processing LLM calls for chunks (60 - 70)
Processing LLM calls for chunks (70 - 80)
LLM's need breaks too! Paused execution for default quota: 15 seconds.


Exception: Oh, maybe something else was the issue? Here is the error: 429 Quota exceeded for aiplatform.googleapis.com/generate_content_requests_per_minute_per_project_per_base_model with base model: gemini-1.5-pro. Please submit a quota increase request. https://cloud.google.com/vertex-ai/docs/generative-ai/quotas-genai.

# Next steps

To learn more about how you can use the embeddings, check out the [examples](https://ai.google.dev/examples?keywords=embed) available. To learn how to use other services in the Gemini API, visit the [Python quickstart](https://ai.google.dev/gemini-api/docs/get-started/python).