In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| | |
|-|-|
|Author(s) | [Zachary Thorman](https://github.com/zthor5)|

# Overview

This notebook generates the JSONL files and training splits needed for fine-tuning from a list of PDFs, using a generative model's full context window to analyze the PDFs.

# Getting Started

In this section, you will install needed dependencies & define the Google Cloud project where you want to connect to Vertex AI.

### Install dependencies

In [None]:
!pip install --upgrade --quiet google-generativeai chromadb pymupdf google-cloud-storage langchain==0.1.20

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.5/126.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

Then import the modules you'll use in this tutorial.

In [14]:
import json
import os
import re
import subprocess
import sys
import textwrap
import time

import chromadb
import numpy as np
import pandas as pd
import pymupdf
from chromadb import Documents, EmbeddingFunction, Embeddings
from IPython.display import Markdown, HTML, display

# Used to securely store your API key
from google.colab import userdata
from google.cloud import storage

# Import LangChain components
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader

# Initialize Vertex AI
import vertexai
import vertexai.preview.generative_models as generative_models
from vertexai.generative_models import GenerativeModel, Part
from vertexai.language_models import TextEmbeddingModel

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by uncommenting and running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ Wait for the kernel to finish restarting before you continue. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
# Additional authentication is required for Google Colab
# Only authenticate when the `google.colab` package has been loaded into this interpreter.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth
    auth.authenticate_user()

### Define Google Cloud project information, initialize Vertex AI, and add Secrets

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Utilizing Secrets to retrieve sensitive information
# You can add your own projectID and location to run in your environment.

# The project ID is read from the Colab secret named 'ProjectId' rather than
# being hard-coded in the notebook.
PROJECT_ID = userdata.get('ProjectId') # @param {type:"string"}
LOCATION = "us-central1"    # @param {type:"string"}


# Point the Vertex AI SDK at the project and location chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Helper Functions for Creating Fine Tuning Data

## Generating JSONLs

Note: *Markdown is currently lost in this conversion.*

In [22]:
# Number of seconds to pause before retrying an LLM call that raised, to stay
# within the default Vertex AI quota.
# NOTE: the name is misspelled ("defualt") but pdf_context_jsonl reads it under
# this exact spelling, so it must not be renamed here alone.
defualt_quota_sec = 20

# Create a text splitter to divide documents into smaller chunks.
# Consecutive chunks share up to `chunk_overlap` units of text (characters, with
# the splitter's default length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=200,
)

# Generation config passed to the model for every request.
# NOTE(review): temperature 0 is presumably chosen to make outputs as repeatable
# as possible - confirm.
generation_config = {
"max_output_tokens": 8192,
"temperature": 0,
"top_p": 0.95,
}

# Safety filters set to BLOCK_NONE for every listed harm category so that
# (hopefully) no generated content is blocked.
safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
}


In [43]:
def progress(value=1, max=1):
    """Build an HTML ``<progress>`` bar for display in the notebook.

    Called with no arguments it renders a completed bar (1 of 1).

    Args:
        value: Amount of work completed so far.
        max: Total amount of work.

    Returns:
        An ``IPython.display.HTML`` object wrapping the ``<progress>`` markup.
    """
    # Attributes are separated by whitespace only; the original markup had a
    # stray comma after the `max` attribute, which is not valid HTML.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 60%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))


def download_pdfs_to_local(pdfList):
  """Copy each PDF from Cloud Storage into the current working directory.

  Args:
    pdfList: Iterable of object URIs such as ``gs://bucket/path/file.pdf``.

  Returns:
    List of local file names (the last path segment of each URI), in the same
    order as ``pdfList``.

  Raises:
    ValueError: If a URI ends with ``/`` and therefore names no file.
    subprocess.CalledProcessError: If ``gsutil cp`` exits with a non-zero status.
  """
  local_pdfs = []
  for pdf in pdfList:
    file_name = pdf.rsplit('/', 1)[-1]
    if not file_name:
      raise ValueError(f'No file name found in URI: {pdf}')
    # Pass an argument list (no shell) so the URI is handed to gsutil verbatim,
    # and use check=True so a failed copy is never reported as downloaded.
    subprocess.run(['gsutil', '-q', 'cp', pdf, '.'], check=True)
    print(f'Downloaded: {pdf}')
    local_pdfs.append(file_name)
  return local_pdfs


def pdf_context_jsonl(local_pdfList, model):
  """Extract text from local PDFs, chunk it, and write ``context.jsonl``.

  The text of every page of every PDF is concatenated and split with the
  notebook-level ``text_splitter``. For each chunk the model is asked for a
  10 word summary, and one JSON object per line is written with the keys
  ``_id`` (``context_<index>``), ``title`` (the summary) and ``text`` (the
  chunk). Newlines are removed from both the chunk and the summary.

  A failed model call is retried once after sleeping ``defualt_quota_sec``
  seconds.

  Args:
    local_pdfList: Paths of PDF files on the local file system.
    model: Object exposing ``generate_content(prompt)`` whose result has a
      ``text`` attribute.

  Returns:
    The concatenated text of all PDFs (before chunking).

  Raises:
    Exception: If the model call fails a second time for any chunk.
  """
  page_texts = []
  for pdf in local_pdfList:
    with pymupdf.open(pdf) as doc:
      page_texts.extend(page.get_text() for page in doc)
  all_pdfs_text = "".join(page_texts)

  # Split the text into chunks
  chunks = text_splitter.split_text(all_pdfs_text)
  print(f'Now proccessing ({len(chunks)}) chunks through LLM...')
  with open('context.jsonl', 'w') as f:
    display_out = display(progress(0, len(chunks)), display_id=True)
    for i, chunk in enumerate(chunks):
      chunk = chunk.replace("\n", "")
      prompt = f"Generate a 10 word summary of this text: {chunk}"
      try:
        title = model.generate_content(prompt)
      except Exception:
        print(f"LLM's need breaks too! Paused execution for default quota: {defualt_quota_sec} seconds.")
        time.sleep(defualt_quota_sec)
        try:
          title = model.generate_content(prompt)
          print(f'Finished LLM call for chunk ({i})')
        except Exception as err:
          raise Exception(f"Oh, maybe something else was the issue? Here is the error: {err}") from err

      # json.dumps escapes quotes, backslashes and control characters, so
      # every written line is valid JSON whatever the PDF text contains.
      record = {
          "_id": f"context_{i}",
          "title": title.text.replace("\n", ""),
          "text": chunk,
      }
      f.write(json.dumps(record, separators=(",", ":")) + "\n")
      display_out.update(progress(i + 1, len(chunks)))

    display_out.update(progress())
    print(f'Finished generating Context.jsonl')
  return all_pdfs_text

# *zthor* Later validate via JSONLines
def validate_jsonl(jsonl_text, model):
  """Check that every non-blank line of ``jsonl_text`` parses as JSON.

  Args:
    jsonl_text: Text holding one JSON document per line.
    model: Unused; kept so existing callers keep working.

  Returns:
    True if every non-blank line is valid JSON (including when there are no
    lines at all). False as soon as one line fails to parse; that line is
    printed first.
  """
  for line in jsonl_text.splitlines():
    # Blank separator lines carry no record, so they are not an error.
    if not line.strip():
      continue
    try:
      json.loads(line)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
      print(f"Invalid JSON Line: {line}")
      return False
  return True


# Creates JSONL Prompts for a PDF and writes them into a file
# *zthor* Modify Prompt to generate reliably at least 10 to 50 Per PDF
def pdf_query_jsonl(gcs_pdfList, model):
  """Ask the model for questions about each PDF and write ``query.jsonl``.

  For every URI the model receives the PDF plus a prompt asking for up to 10
  questions in JSONL form. Blank lines and markdown code-fence lines are
  dropped from each response, the remaining lines of all responses are joined
  with newlines, written to ``query.jsonl``, and that same text is validated
  with ``validate_jsonl``. If validation fails the user is asked whether to
  regenerate the file.

  Note that the status messages refer to "prompt.jsonl" although the file
  written is ``query.jsonl``; the strings are kept unchanged for callers.

  Args:
    gcs_pdfList: ``gs://`` URIs of the PDFs to read.
    model: Object exposing ``generate_content(contents)`` whose result has a
      ``text`` attribute.

  Returns:
    A status string: "Successful creation of prompt.jsonl" when the written
    text validates (possibly after a retry), otherwise
    "Potential failed creation of prompt.jsonl".
  """
  print('Creating prompt.jsonl [May take a few minutes..]')
  query_lines = []
  display_out = display(progress(0, len(gcs_pdfList)), display_id=True)
  for x, pdf in enumerate(gcs_pdfList):
    pdf_file = Part.from_uri(pdf, mime_type="application/pdf")
    prompt = f'Output in JSONL up to 10 questions that can be answered based on the content of the pdf provided. Output only in JSONL format using this template: {{"_id":"query_{x}_[An iterable number starting with 0]","text":"[A question based on the pdf provided]"}}'
    output = model.generate_content([prompt,pdf_file])
    # Keep only the JSONL payload: drop blank lines and markdown code fences
    # that the model may wrap around its answer.
    for line in output.text.splitlines():
      line = line.strip()
      if line and not line.startswith('```'):
        query_lines.append(line)
    display_out.update(progress(x + 1, len(gcs_pdfList)))

  # Write and validate the same text, so the outcome describes the file on
  # disk and records from different PDFs can never fuse into one line.
  jsonl_text = "\n".join(query_lines)
  with open('query.jsonl', 'w') as f:
    f.write(jsonl_text)

  is_validated = validate_jsonl(jsonl_text, model)
  if is_validated:
    display_out.update(progress())
    print(f"Validate_jsonl_llm returned: {is_validated}")
    return "Successful creation of prompt.jsonl"

  if 'yes' in input('Failed creation of prompt.jsonl; Reattempt creation? (yes or no): ').lower():
    # Propagate the outcome of the retry instead of returning None.
    return pdf_query_jsonl(gcs_pdfList, model)

  display_out.update(progress())
  return "Potential failed creation of prompt.jsonl"


def _read_jsonl_records(path):
  """Parse every non-blank line of the JSONL file at ``path`` into a list."""
  records = []
  with open(path, 'r') as f:
    for line in f:
      if not line.strip():
        continue
      try:
        records.append(json.loads(line))
      except ValueError as err:
        raise Exception(f"Some invalid JSON slipped into the {path}!\nInvalid JSON [{line}]\n Here is the error: {err}") from err
  return records


def create_pairing_tsv(pdfList, pdf_text, model, pause_sec=1):
  """Score every (query, context) pair with the model.

  Reads ``query.jsonl`` and ``context.jsonl`` from the current directory and,
  for each query/context combination, asks the model for an integer from 1 to
  10 describing how well the context answers the question. The first run of
  digits in the response is taken as the score.

  The rows are only collected and returned; shuffling them and writing the
  TSV file is not implemented yet.

  Args:
    pdfList: Unused; kept for interface compatibility.
    pdf_text: Unused; kept for interface compatibility.
    model: Object exposing ``generate_content(prompt)`` whose result has a
      ``text`` attribute.
    pause_sec: Seconds to sleep after each model call.

  Returns:
    A list of rows. The first row is the header
    ``["query-id", "corpus-id", "score"]``; each following row is
    ``[query _id, context _id, integer score]``. Pairs whose response holds
    no digits are reported and left out.

  Raises:
    Exception: If a non-blank line of either JSONL file is not valid JSON.
  """
  all_pairs = [["query-id","corpus-id","score"]]
  # Both files are parsed once up front, so context.jsonl is not reopened and
  # reparsed for every query, and bad JSON is reported before any model call.
  queries = _read_jsonl_records('query.jsonl')
  contexts = _read_jsonl_records('context.jsonl')

  for query_json_line in queries:
    for context_json_line in contexts:
      query = query_json_line["text"]
      content = context_json_line["text"]
      prompt = f"Respond with only an integer that describes how well the context answers the question. The integer can be from 1 (Does not contatin any relevant information to answer the question) to 10 (Directly has information to answer the question).\nThe question: {query}\nThe context: {content}"
      response = model.generate_content(prompt)
      # re.search returns a Match object (or None), so the digits have to be
      # pulled out with .group() before converting to int.
      score_match = re.search(r'\d+', response.text)
      if score_match is None:
        print(f'No integer score returned for {query_json_line["_id"]} / {context_json_line["_id"]}; skipping pair.')
      else:
        pairing = [query_json_line["_id"], context_json_line["_id"], int(score_match.group())]
        print(pairing)
        all_pairs.append(pairing)
      time.sleep(pause_sec)
  # TODO: shuffle the pairs and write them out as a TSV file.
  return all_pairs



#f.read()




For more details on the dataset format required for fine-tuning, [learn more here!](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#dataset-format)

In [None]:
# Gemini model used for every generation step below; it is configured with the
# generation config and safety settings defined earlier in the notebook.
model = GenerativeModel("gemini-1.5-pro-preview-0514", generation_config = generation_config, safety_settings=safety_settings)

# *zthor* Change to only accept bucket location & iterate through all objects in bucket (no sub folders). WARN User that must be PDF's
# Source PDFs, given as Cloud Storage URIs.
gcs_pdfList =  ["gs://dmv-pdf-analysis/driver_manual_ga_2024.pdf",
            "gs://dmv-pdf-analysis/commercial_driver_guide.pdf",
            "gs://dmv-pdf-analysis/motorcycle_operator_guide.pdf",
            "gs://dmv-pdf-analysis/40_hour_teen_driving_guide.pdf",
            "gs://dmv-pdf-analysis/alcohol_drug_awareness_student.pdf",
] # Needs to be stored in a GCS Bucket

# Step 1: copy the PDFs into the working directory so their text can be read locally.
local_pdf_list = download_pdfs_to_local(gcs_pdfList)

# Step 2: write context.jsonl from the local copies and keep the combined text.
all_pdf_text = pdf_context_jsonl(local_pdf_list, model)
# Step 3: write query.jsonl; the model reads the PDFs from their gs:// URIs.
# The returned status string is not used here.
pdf_query_jsonl(gcs_pdfList, model)
# Step 4: ask the model to score query/context pairs read from the two JSONL files.
create_pairing_tsv(local_pdf_list, all_pdf_text, model)

Caught CTRL-C (signal 2) - exiting
Traceback (most recent call last):
  File "/tools/google-cloud-sdk/platform/gsutil/gsutil", line 21, in <module>
    gsutil.RunMain()
  File "/tools/google-cloud-sdk/platform/gsutil/gsutil.py", line 152, in RunMain
    sys.exit(gslib.__main__.main())
  File "/tools/google-cloud-sdk/platform/gsutil/gslib/__main__.py", line 436, in main
    return _RunNamedCommandAndHandleExceptions(
  File "/tools/google-cloud-sdk/platform/gsutil/gslib/__main__.py", line 786, in _RunNamedCommandAndHandleExceptions
    _HandleUnknownFailure(e)
  File "/tools/google-cloud-sdk/platform/gsutil/gslib/__main__.py", line 633, in _RunNamedCommandAndHandleExceptions
    return command_runner.RunNamedCommand(command_name,
  File "/tools/google-cloud-sdk/platform/gsutil/gslib/command_runner.py", line 421, in RunNamedCommand
    return_code = command_inst.RunCommand()
  File "/tools/google-cloud-sdk/platform/gsutil/gslib/commands/cp.py", line 1116, in RunCommand
    self.stats_loc

# Next steps

To learn more about how you can use the embeddings, check out the [examples](https://ai.google.dev/examples?keywords=embed) available. To learn how to use other services in the Gemini API, visit the [Python quickstart](https://ai.google.dev/gemini-api/docs/get-started/python).