In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| | |
|-|-|
|Author(s) | [Zachary Thorman](https://github.com/zthor5)|

# Overview

This notebook generates the JSONL files and the training, validation, and testing splits needed to fine-tune text embeddings. It starts from a list of PDFs and uses a generative model's full context window to analyze them.

# Getting Started

In this section, you will install needed dependencies & define the Google Cloud project where you want to connect to Vertex AI.

### Install dependencies

In [1]:
!pip install --upgrade --quiet google-generativeai chromadb pymupdf google-cloud-storage langchain==0.1.20

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.3/584.3 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[

Then import the modules you'll use in this tutorial.

In [2]:
import textwrap
import chromadb
import random
import numpy as np
import pandas as pd
import pymupdf
import re
import time
import datetime
import json
import os

from vertexai.generative_models import GenerativeModel, Part
import vertexai.preview.generative_models as generative_models
from vertexai.language_models import TextEmbeddingModel

# Used to securely store your API key
from google.colab import userdata
from google.cloud import storage

from IPython.display import Markdown, HTML, display
from chromadb import Documents, EmbeddingFunction, Embeddings

# Import LangChain components
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader


# Initialize Vertex AI
import vertexai
import sys

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by uncommenting and running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [3]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ Wait for the kernel to finish restarting before you continue. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [4]:
# Google Colab needs an explicit sign-in before Google Cloud APIs can be called.
_running_in_colab = "google.colab" in sys.modules
if _running_in_colab:
    # Imported here, inside the guard, so the import is only attempted on Colab.
    from google.colab import auth

    auth.authenticate_user()

### Define Google Cloud project information, initialize Vertex AI, and add Secrets

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [5]:
# Project settings for Vertex AI.
# The project ID is read from the notebook secret named 'ProjectId' through `userdata.get`;
# replace either value with your own project ID / location to run in your environment.

PROJECT_ID = userdata.get('ProjectId') # @param {type:"string"}
LOCATION = "us-central1"    # @param {type:"string"}


# Initialize the Vertex AI SDK with the project and location chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Helper Functions for Creating Fine Tuning Data

## Generating JSONLs

Note: *Markdown is currently lost in this conversion.*

In [6]:
# Seconds to pause before retrying a failed Vertex AI call (the default quota is easy to
# exhaust), and how many times a single call is attempted.
defualt_quota_sec = 12
default_attempts = 5

# Fractions of the scored pairs written to training.tsv, validation.tsv and testing.tsv.
TRAINING_SPLIT, VALIDATION_SPLIT, TESTING_SPLIT = 0.8, 0.1, 0.1

# Splitter that cuts the extracted document text into smaller, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)

# Generation settings handed to the generative model.
generation_config = dict(max_output_tokens=8192, temperature=0, top_p=0.95)

# Request BLOCK_NONE for every harm category so the generated content is (hopefully) not blocked.
safety_settings = {
    harm_category: generative_models.HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}


In [31]:
def _empty_directory(folder):
  """Create `folder` if it does not exist, then delete everything inside it."""
  import shutil

  os.makedirs(folder, exist_ok=True)
  # Snapshot the entries first so nothing is deleted while the directory is being iterated.
  with os.scandir(folder) as scan:
    entries = list(scan)
  for entry in entries:
    if entry.is_dir(follow_symlinks=False):
      shutil.rmtree(entry.path)
    else:
      os.remove(entry.path)


# Pass the folder path used for storing the downloaded PDFs
def create_clean_folders(PDF_Path):
  """Ensure `PDF_Path` and ./output/ exist and are empty.

  Both folders are created when missing and any existing content is removed.
  The cleanup is done in Python instead of `rm -rf {path}*`, so it works for
  paths containing spaces, does not need a shell, and only ever touches the
  content of the two folders (never sibling paths sharing the same prefix).

  Args:
    PDF_Path: Folder that will receive the downloaded PDFs.
  """
  _empty_directory(PDF_Path)
  _empty_directory("./output/")

def update_text(text = "default text"):
    """Wrap `text` in an HTML paragraph so it can be shown or refreshed via a display handle."""
    paragraph_template = """
        <p>{}</p>
    """
    return HTML(paragraph_template.format(text))

def progress(value =1, max =1):
    """Build an HTML <progress> bar showing `value` out of `max`.

    Called without arguments it renders a completed bar (1 of 1).
    The parameter name `max` shadows the built-in but is kept for callers.
    """
    # The attributes are separated by whitespace only; a comma after `max` is not valid HTML.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 60%'>
            {value}
        </progress>
    """.format(value=value, max=max))


def download_bucket_to_local(bucket_uri, local_folder):
  """Download every object of a Cloud Storage bucket into `local_folder`.

  Args:
    bucket_uri: Bucket name (no "gs://" prefix, no trailing slash).
    local_folder: Existing local folder that receives the files.

  Returns:
    List of "gs://<bucket>/<object>" URIs, one per downloaded object.
  """
  gcs_uri_list = []
  storage_client = storage.Client()
  bucket = storage_client.bucket(bucket_uri)
  display_out_text = display(update_text("Starting Download of PDF's..."), display_id=True)
  for blob in bucket.list_blobs():
    # A name ending in "/" cannot be written as a local file (presumably a folder
    # placeholder object), so there is nothing to download for it.
    if blob.name.endswith("/"):
      continue
    # Flatten nested object names so every file lands directly inside local_folder,
    # where the rest of the notebook looks for it.
    local_name = blob.name.strip("/").replace("/", "_")
    file_path = os.path.join(local_folder, local_name)
    blob.download_to_filename(file_path)
    gcs_uri_list.append(f"gs://{bucket_uri}/{blob.name}")
    display_out_text.update(update_text(f"Downloaded: {blob.name}"))
  display_out_text.update(update_text("Downloaded all PDFs!"))
  return gcs_uri_list


def pdf_context_jsonl(pdf_folder, model):
  """Extract the text of every PDF in `pdf_folder` and write ./output/context.jsonl.

  The combined text is split with the module-level `text_splitter`; for each chunk
  the model is asked for a 10 word summary, and one JSON object per line is written
  with the keys "_id" ("context_<n>"), "title" (the summary) and "text" (the chunk).

  Args:
    pdf_folder: Folder containing the documents to read.
    model: Object exposing `generate_content(prompt)` whose result has a `.text`.

  Returns:
    The concatenated text of all documents (before chunking).

  Raises:
    Exception: When the model call for a chunk still fails after `default_attempts` tries.
  """
  page_texts = []
  # Sorted so the concatenation order (and therefore the chunking) is reproducible.
  for pdf in sorted(os.listdir(pdf_folder)):
    with pymupdf.open(os.path.join(pdf_folder, pdf)) as doc:
      for page in doc:
        page_texts.append(page.get_text())
  all_pdfs_text = "".join(page_texts)

  # Split the text into chunks
  chunks = text_splitter.split_text(all_pdfs_text)
  print(f'Now proccessing ({len(chunks)}) chunks through LLM...')
  with open('./output/context.jsonl', 'w') as f:
    display_out = display(progress(0, len(chunks)), display_id=True)
    for i, chunk in enumerate(chunks):
      # NOTE(review): line breaks are dropped (not replaced by a space), as before.
      chunk = chunk.replace("\n", "")

      title = None
      last_error = None
      for attempt in range(default_attempts):
        try:
          title = model.generate_content(f"Generate a 10 word summary of this text: {chunk}")
          break
        except Exception as err:
          last_error = err
          print(f"LLM's need breaks too! Reattempting {attempt+1}/{default_attempts}. Pausing for {defualt_quota_sec} seconds.")
          time.sleep(defualt_quota_sec)
      if title is None:
        # Never continue with a missing (or previous chunk's) summary.
        raise Exception("Cancelling Process. Please either:\n - Increase [defualt_quota_sec] to allow for longer pauses between API calls\n - Request a Quota Increase request for your API") from last_error

      # json.dumps escapes quotes, backslashes and control characters, so every line is valid JSON.
      record = {"_id": f"context_{i}", "title": title.text.replace("\n", ""), "text": chunk}
      f.write(json.dumps(record) + "\n")
      display_out.update(progress(i + 1, len(chunks)))

    display_out.update(progress())
    print('Finished generating context.jsonl')
  return all_pdfs_text

def validate_jsonl(jsonl_text, model):
  """Return True when every line of `jsonl_text` parses as JSON, otherwise False.

  The first line that fails to parse is printed before returning False.
  `model` is accepted for interface compatibility and is not used.
  """
  candidate_lines = jsonl_text.splitlines()
  position = 0
  while position < len(candidate_lines):
    candidate = candidate_lines[position]
    try:
      json.loads(candidate)
    except Exception:
      print(f"Invalid JSON Line: {candidate}")
      return False
    position += 1
  return True


# Creates JSONL queries for each PDF and writes them into ./output/query.jsonl
def pdf_query_jsonl(gcs_pdfList, model):
  """Ask the model for questions about each PDF and write them to ./output/query.jsonl.

  Roughly 800 questions are requested in total, split evenly across the PDFs.
  Every non-blank line of a model answer becomes one JSON object with the keys
  "_id" ("query_<pdf index>_<line index>") and "text". Lines are separated by
  "\n" and the file has no trailing newline.

  Args:
    gcs_pdfList: List of "gs://..." URIs of the PDFs.
    model: Object exposing `generate_content([prompt, part])` whose result has a `.text`.

  Returns:
    A status string describing whether the written file validated as JSONL.
  """
  print('Creating query.jsonl [May take a few minutes..]')
  num_of_aprox_queries = 800
  total_pdfs = len(gcs_pdfList)
  json_lines = []
  with open('./output/query.jsonl', 'w') as f:
    display_out = display(progress(0, total_pdfs), display_id=True)
    for x, pdf in enumerate(gcs_pdfList):
      pdf_file = Part.from_uri(pdf, mime_type="application/pdf")
      num_of_queries_per_pdf = num_of_aprox_queries // total_pdfs # Rough number of queries per PDF
      prompt = f'Output up to {num_of_queries_per_pdf} questions that can be answered based on the content of the pdf provided. Output each question on a new line and utilize the seven question words to create various styles of questions.'
      output = model.generate_content([prompt,pdf_file])
      for i, line in enumerate(output.text.splitlines()):
        question = line.strip()
        if not question:
          # Blank separator lines in the answer are not questions.
          continue
        # json.dumps escapes quotes, backslashes and control characters correctly.
        clean_json = json.dumps({"_id": f"query_{x}_{i}", "text": question})
        if json_lines:
          # Separator goes before every record except the first, so the file
          # never ends with a newline.
          f.write("\n")
        f.write(clean_json)
        json_lines.append(clean_json)
      display_out.update(progress(x, total_pdfs))

  is_validated = validate_jsonl("\n".join(json_lines), model)
  if is_validated:
    display_out.update(progress())
    print(f"Validate_jsonl_llm returned: {is_validated}")
    return "Successful creation of query.jsonl"
  if 'yes' in input('Failed creation of prompt.jsonl; Reattempt creation? (yes or no): ').lower():
    # Propagate the status of the retry instead of silently returning None.
    return pdf_query_jsonl(gcs_pdfList, model)
  display_out.update(progress())
  return "Potential failed creation of prompt.jsonl"


def create_pairing_file(pair_list, num_of_pairs, tsv_file_loc):
  """Move randomly chosen pairs out of `pair_list` into a TSV file.

  The file starts with the header "query-id<TAB>corpus-id<TAB>score" followed by
  one row per selected pair; rows are separated by "\n" and the file has no
  trailing newline after the last row.

  Args:
    pair_list: List of [query_id, corpus_id, score] items. Selected items are
      removed from this list in place.
    num_of_pairs: Number of pairs to write; capped at the number available.
    tsv_file_loc: Path of the TSV file to create.

  Returns:
    `pair_list` (the same object), now holding only the pairs that were not written.
  """
  rows_to_write = max(0, min(num_of_pairs, len(pair_list)))
  with open(tsv_file_loc, 'w') as f:
    f.write("query-id\tcorpus-id\tscore\n")
    for i in range(rows_to_write):
      random_element = pair_list.pop(random.randrange(len(pair_list)))
      f.write(f"{random_element[0]}\t{random_element[1]}\t{random_element[2]}")
      # Compare against the fixed row count: the list shrinks on every pop, so its
      # length cannot be used to detect the last row.
      if i != rows_to_write - 1:
        f.write("\n")
  return pair_list


def _read_jsonl_records(jsonl_path, file_label):
  """Parse every non-blank line of `jsonl_path` as JSON and return the records in file order."""
  records = []
  with open(jsonl_path, 'r') as jsonl_file:
    for raw_line in jsonl_file:
      if not raw_line.strip():
        continue
      try:
        records.append(json.loads(raw_line))
      except Exception as err:
        raise Exception(f"Some invalid JSON slipped into the {file_label}!\nInvalid JSON [{raw_line}]\n Here is the error: {err}")
  return records


def _format_eta(total_seconds_left):
  """Format a number of seconds as "<h>h:<m>m:<s>s"."""
  hours_left = int(total_seconds_left/3600)
  minutes_left = int((total_seconds_left/60) % 60)
  return f"{hours_left}h:{minutes_left}m:{int(total_seconds_left%60)}s"


# Requires JSONLs to already have been created
def create_pairing_tsv(pdf_text, model):
  """Score every query/context combination with the model and write the split TSV files.

  Reads ./output/query.jsonl and ./output/context.jsonl, asks the model to rate
  each (query, context) pair from 1 to 10, keeps every pair whose score is not 1,
  and distributes the kept pairs at random over ./output/training.tsv,
  ./output/validation.tsv and ./output/testing.tsv according to TRAINING_SPLIT,
  VALIDATION_SPLIT and TESTING_SPLIT.

  Args:
    pdf_text: Not used by the current implementation; kept for callers.
    model: Object exposing `generate_content(prompt)` whose result has a `.text`.

  Raises:
    Exception: When the three split fractions do not add up to 1, when a JSONL
      line cannot be parsed, or when a model call still fails after
      `default_attempts` tries.
  """
  import math

  # Fail fast: check the configuration before spending hours on model calls.
  # isclose avoids rejecting valid splits because of floating-point rounding.
  if not math.isclose(TRAINING_SPLIT + VALIDATION_SPLIT + TESTING_SPLIT, 1.0):
    raise Exception("TRAINING_SPLIT + VALIDATION_SPLIT + TESTING_SPLIT must equal 1")

  # Parse both files once up front instead of re-reading context.jsonl for every query.
  queries = _read_jsonl_records('./output/query.jsonl', 'query.jsonl')
  contexts = _read_jsonl_records('./output/context.jsonl', 'context.jsonl')
  total_lines = len(queries) * len(contexts)

  z_progress = 0
  runs_for_eta = 500
  time_left_list = [0] * runs_for_eta # Uses the last runs as average to determine ETA
  unscored_responses = 0
  print('Creating pairing.tsv via [DEEP ANALYSIS] [This will take a couple hours..]') # 5 PDFs / 100 Prompts => 3.5 Hours
  display_out = display(progress(0, 1), display_id=True)
  display_out_text = display(update_text("Currently running..."), display_id=True)
  all_pairs = []
  for query_record in queries:
    query = query_record["text"]
    for context_record in contexts:
      z_progress += 1
      content = context_record["text"]
      prompt = f"Respond with only an integer that describes how well the context answers the question. The integer can be from 1 (Does not contatin any relevant information to answer the question) to 10 (Directly has information to answer the question).\nThe question: {query}\nThe context: {content}"

      # The ETA is only shown once every slot of the timing window holds a measurement.
      eta = "not available yet"
      if 0 not in time_left_list:
        average_seconds = sum(time_left_list) / runs_for_eta
        eta = _format_eta(average_seconds * (total_lines - z_progress))

      timer = time.perf_counter()
      response = None
      last_error = None
      for attempt in range(default_attempts):
        try:
          response = model.generate_content(prompt)
          break
        except Exception as err:
          last_error = err
          display_out_text.update(update_text(f"#{z_progress} | LLM's need breaks too! Reattempting {attempt+1}/{default_attempts}. Pausing for {defualt_quota_sec} seconds."))
          time.sleep(defualt_quota_sec)
      if response is None:
        # Never score a pair with a missing (or previous pair's) response.
        raise Exception("Cancelling Process. Please either:\n - Increase [defualt_quota_sec] to allow for longer pauses between API calls\n - Request a Quota Increase for the API") from last_error
      display_out_text.update(update_text(f"Currently running... #{z_progress}/{total_lines} | ETA [{eta}]"))
      time_left_list[z_progress % runs_for_eta] = time.perf_counter() - timer

      score_match = re.search(r'\d+', response.text)
      if score_match is None:
        # The answer contained no integer, so this pair cannot be scored.
        unscored_responses += 1
      else:
        response_only_int = int(score_match.group())
        if response_only_int != 1:
          all_pairs.append([query_record["_id"], context_record["_id"], response_only_int])
      display_out.update(progress(z_progress, total_lines))

  display_out.update(progress(1,1))
  print("Completed Deep Analysis of query <-> Context!")
  if unscored_responses:
    print(f"Skipped {unscored_responses} pairs because the response contained no integer score.")
  print(f"Length of all_pairs: {len(all_pairs)}")
  total_pairs = len(all_pairs)
  all_pairs = create_pairing_file(all_pairs, int(TRAINING_SPLIT * total_pairs), './output/training.tsv')
  all_pairs = create_pairing_file(all_pairs, int(VALIDATION_SPLIT * total_pairs), './output/validation.tsv')
  # Whatever is left after the first two splits becomes the testing set.
  all_pairs = create_pairing_file(all_pairs, len(all_pairs), './output/testing.tsv')



def upload_output_to_bucket(gcs_files_bucket):
  gcs_bucket = f"gs://{PROJECT_ID}-output"
  client = storage.Client()
  try:
    client.get_bucket(gcs_bucket.split("/")[-1]).exists()
  except Exception as err:
    !gcloud storage buckets create {gcs_bucket} --location={LOCATION} --project={PROJECT_ID} --no-user-output-enabled
  uid = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
  gcs_output = f"/output__{gcs_files_bucket}__{uid}/"
  local_uri = f"./output/."
  !gsutil -q -m cp -r {local_uri} {gcs_bucket + gcs_output}
  print(f"Uploaded Output to GCS bucket: {gcs_bucket + gcs_output}")

For more details on the dataset format required for fine-tuning embeddings, [learn more here!](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#dataset-format)

In [32]:
# Model used for the summaries, the query generation and the pair scoring.
model = GenerativeModel("gemini-1.5-pro-preview-0514", generation_config = generation_config, safety_settings=safety_settings)

# Only Store PDFs in the Bucket
gcs_bucket = "ny-unemployment-laws" #"testing-finetuning" # Do not put any slashes after uri!
pdf_folder ="./downloaded_pdfs/" # Include a slash after the uri!

# Start from empty working folders, then fetch the PDFs.
create_clean_folders(pdf_folder)
gcs_pdf_list = download_bucket_to_local(gcs_bucket, pdf_folder)

# Build context.jsonl and query.jsonl.
all_pdf_text = pdf_context_jsonl(pdf_folder, model)
pdf_query_jsonl(gcs_pdf_list, model)

# Score every query/context pair and time how long it takes.
timer = time.perf_counter()
create_pairing_tsv(all_pdf_text, model)
timer = time.perf_counter() - timer
# Split the elapsed time into whole hours plus the remaining minutes, so that e.g.
# 90 minutes is reported as "1 hours and 30 minutes".
rounded_hours, rounded_minutes = divmod(round(timer/60), 60)
print(f'Finished (DEEP ANALYSIS) of create_pairing_tsv in {rounded_hours} hours and {rounded_minutes} minutes.')
# <zthor> Add validator for TSV incase errors occured during execution.
upload_output_to_bucket(gcs_bucket)


Now proccessing (43) chunks through LLM...


Finished generating context.jsonl
Creating query.jsonl [May take a few minutes..]


Validate_jsonl_llm returned: True
Creating pairing.tsv via [DEEP ANALYSIS] [This will take a couple hours..]


KeyboardInterrupt: 

# Next steps

To learn more about how you can use the embeddings, check out the [examples](https://ai.google.dev/examples?keywords=embed) available. To learn how to use other services in the Gemini API, visit the [Python quickstart](https://ai.google.dev/gemini-api/docs/get-started/python).