In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| | |
|-|-|
|Author(s) | [Zachary Thorman](https://github.com/zthor5)|

# Overview

This notebook generates JSONL files and training splits for fine-tuning from a list of PDFs, using a generative AI model's full context to analyze the PDFs.

# Getting Started

In this section, you will install needed dependencies & define the Google Cloud project where you want to connect to Vertex AI.

### Install dependencies

In [26]:
!pip install --upgrade --quiet google-generativeai chromadb pymupdf langchain==0.1.20

Then import the modules you'll use in this tutorial.

In [27]:
# Standard library
import json
import sys
import textwrap

# Third-party
import chromadb
import numpy as np
import pandas as pd
import pymupdf
from chromadb import Documents, EmbeddingFunction, Embeddings
from IPython.display import Markdown

# Used to securely store your API key
from google.colab import userdata

# Import LangChain components
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader

# Initialize Vertex AI
import vertexai
import vertexai.preview.generative_models as generative_models
from vertexai.generative_models import GenerativeModel, Part
from vertexai.language_models import TextEmbeddingModel

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by uncommenting and running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [28]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ Wait for the kernel to finish restarting before you continue. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [29]:
# Google Colab needs an extra sign-in step; every other runtime skips this cell's work.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    # Imported lazily because the module only exists inside Colab.
    from google.colab import auth

    # Authenticate the user to Google Cloud.
    auth.authenticate_user()

### Define Google Cloud project information, initialize Vertex AI, and add Secrets

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [30]:
# Utilizing Secrets to retrieve sensitive information
# You can add your own projectID and location to run in your environment.
# NOTE: `userdata` is imported from `google.colab`, so the lookup below reads the
# secret named 'ProjectId' and is only available in a Colab runtime; in other
# environments assign PROJECT_ID directly.

PROJECT_ID = userdata.get('ProjectId') # @param {type:"string"}
LOCATION = "us-central1"    # @param {type:"string"}


# Point the Vertex AI SDK at the chosen project and region for the cells that follow.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Class for Flexible LLM Caller

## Generate JSONLs



In [31]:
# Generation settings handed to GenerativeModel when the model is constructed later
# in this notebook. Defined at module level, so no `global` statement is needed
# (it is a no-op outside a function body).
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 0,
    "top_p": 0.95,
}

# class GenModel:

#     def __init__(self, model_name, temperature):
#         self.model = GenerativeModel("gemini-1.5-pro-001", generation_config = generation_config)
#         self.model_name = model_name # To be used later for picking models via a Match
#         self.temperature = temperature # To be used later for picking models via a Match

#     def get_answer(self, prompt):
#         response = self.model.generate_content(prompt)
#         return response.text


# class Vanilla_Embedding_Model:

#     def __init__(self, model_name, temperature):
#         self.embed_model = TextEmbeddingModel.from_pretrained("text-embedding-004")

#     def get_answer(self, prompt):
#         embedding = self.embed_model.get_embeddings([prompt])
#         vector = embedding[0].values
#         return vector

## Test Generating JSONLs

Note: *Markdown is currently lost in this conversion.*

In [32]:
! wget -q -O ga_driver_manual.pdf 'https://drive.google.com/uc?export=download&id=1JZEGVb8rUqZ_be9LfizDKLzPz1xQElGR'

# Create a text splitter to divide documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=200,
)

In [45]:
def pdf_context_jsonl(pdfList, model, output_path='context.jsonl', splitter=None):
  """Extract text from PDFs, chunk it, and write one titled JSONL record per chunk.

  Each output line is a JSON object with the keys `_id` ("context_<index>"),
  `title` (a model-generated summary of the chunk) and `text` (the chunk with
  newlines flattened to spaces).

  Args:
    pdfList: Iterable of PDF locations that `pymupdf.open` can read.
      NOTE(review): `pymupdf.open` is given each entry as-is; confirm the
      entries are locally readable paths rather than remote URIs.
    model: Object exposing `generate_content(prompt)` whose result has a
      `.text` attribute.
    output_path: File the JSONL records are written to (overwritten if present).
    splitter: Object exposing `split_text(text)`. Defaults to the
      notebook-level `text_splitter`.

  Returns:
    None. The result is the file written at `output_path`.
  """
  if splitter is None:
    splitter = text_splitter

  # Collect page texts and join once instead of growing a string in the loop.
  pages = []
  for pdf in pdfList:
    with pymupdf.open(pdf) as doc:
      pages.extend(page.get_text() for page in doc)
  text = "".join(pages)

  # Split the text into chunks
  print(len(text))
  chunks = splitter.split_text(text)
  print(f'Number of chunks: {len(chunks)}')

  with open(output_path, 'w', encoding='utf-8') as f:
    for i, chunk in enumerate(chunks):
      chunk = chunk.replace("\n", " ")
      response = model.generate_content(f"Generate a 10 word summary of this text: {chunk}")
      # Use the response's text, not the response object itself, as the title.
      record = {"_id": f"context_{i}", "title": response.text.strip(), "text": chunk}
      # json.dumps escapes quotes, backslashes and control characters, so the
      # chunk no longer needs its double quotes rewritten to stay valid JSON.
      f.write(json.dumps(record, ensure_ascii=False) + "\n")

def validate_jsonl_llm(jsonl_text, model):
  """Ask the model whether `jsonl_text` is valid JSONL.

  Args:
    jsonl_text: Text to be judged.
    model: Object exposing `generate_content(prompt)` whose result has a
      `.text` attribute.

  Returns:
    True when the lower-cased reply contains the substring "true", else False.
  """
  instruction = "Return 'true' or 'false' based on if the following text is properly formatted and valid JSONL:"
  verdict = model.generate_content(f"{instruction}\n{jsonl_text}")
  return "true" in verdict.text.lower()

# Creates JSONL Prompts for a PDF and writes them into a file
# TODO(zthor): Modify Prompt to generate reliably at least 10 to 50 Per PDF
def pdf_query_jsonl(pdfList, model, output_path='query.jsonl'):
  """Generate question records for every PDF and write them to one JSONL file.

  The model is prompted once per PDF. All replies are combined, written to
  `output_path`, and then checked with `validate_jsonl_llm`. When the check
  fails the user is asked on stdin whether to regenerate.

  Args:
    pdfList: Sequence of PDF URIs accepted by `Part.from_uri`.
    model: Object exposing `generate_content(...)` whose result has a `.text`
      attribute.
    output_path: File the combined output is written to (overwritten on every
      attempt).

  Returns:
    "Successful Creation of Prompt.jsonl" when validation passes, otherwise
    "Failed Creation of Prompt.jsonl" (also returned for an empty `pdfList`,
    in which case nothing is generated or written).
  """
  # Nothing to generate from: report failure instead of crashing later on.
  if not pdfList:
    return "Failed Creation of Prompt.jsonl"

  prompt = 'Output JSONL questions based on the pdf provided  with the context needed to answer each question in this format, filling in the brackets: {"_id":"query_[An Iterable number]","title":"[The iterable number from before times 2]","text":"[A question based on the pdf provided]", "context":"[Cite the answer from the PDF]"}'

  # Loop (rather than recurse) so a retry's outcome is actually returned.
  while True:
    generated = []
    for pdf in pdfList:
      pdf_file = Part.from_uri(pdf, mime_type="application/pdf")
      output = model.generate_content([prompt, pdf_file])
      # Strip so consecutive outputs are separated by exactly one newline.
      generated.append(output.text.strip())
    jsonl_text = "\n".join(generated) + "\n"

    # Open the file once per attempt so every PDF's output is kept; opening it
    # in 'w' mode inside the per-PDF loop would keep only the last PDF.
    with open(output_path, 'w', encoding='utf-8') as f:
      f.write(jsonl_text)

    # validate_jsonl_llm expects (jsonl_text, model), in that order.
    if validate_jsonl_llm(jsonl_text, model):
      return "Successful Creation of Prompt.jsonl"
    if 'yes' not in input('Failed Creation of Prompt.jsonl. Reattempt creation?: ').lower():
      return "Failed Creation of Prompt.jsonl"

def create_pairing_tsv(pdfList, model):
  """Placeholder: this step has not been implemented yet.

  The draft consisted only of a `with open('query.jsonl', 'r')` header with no
  body, which is a SyntaxError and stopped every function in this cell from
  being defined. Until the logic is written, fail loudly at call time instead.

  TODO: implement. Presumably this should read 'query.jsonl' and emit a TSV
  pairing queries with context records; confirm the intended format.

  Raises:
    NotImplementedError: always.
  """
  raise NotImplementedError("create_pairing_tsv is not implemented yet")



In [44]:
model = GenerativeModel("gemini-1.5-pro-preview-0514", generation_config = generation_config)

pdfList =  ["gs://dmv-pdf-analysis/driver_manual_ga_2024.pdf",
            "gs://dmv-pdf-analysis/commercial_driver_guide.pdf",
            "gs://dmv-pdf-analysis/motorcycle_operator_guide.pdf",
            "gs://dmv-pdf-analysis/40_hour_teen_driving_guide.pdf",
            "gs://dmv-pdf-analysis/alcohol_drug_awareness_student.pdf",
] # Needs to be stored in a GCS Bucket



pdf_context_jsonl(pdfList, model)
print(pdf_query_jsonl(pdfList, model))


AttributeError: 'str' object has no attribute 'append'

# Next steps

To learn more about how you can use the embeddings, check out the [examples](https://ai.google.dev/examples?keywords=embed) available. To learn how to use other services in the Gemini API, visit the [Python quickstart](https://ai.google.dev/gemini-api/docs/get-started/python).