In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| | |
|-|-|
|Author(s) | [Zachary Thorman](https://github.com/zthor5)|

# Overview

This notebook evaluates how different combinations of fine-tuning, RAG, and models perform.

The notebook currently demonstrates Gemini 1.5 Pro combined with different embedding models, using ChromaDB to build and evaluate a RAG architecture.


<img src="https://drive.google.com/uc?export=download&id=1LizTwffekG1RfvpaRYAiuNTRM_Q8duVg" width="70%">

# Getting Started

In this section, you will install needed dependencies & define the Google Cloud project where you want to connect to Vertex AI.

### Install dependencies

In [2]:
!pip install --upgrade --quiet google-generativeai chromadb pymupdf google-cloud-storage langchain==0.1.20

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.4/581.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.5/126.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

Then import the modules you'll use in this tutorial.

In [3]:
import textwrap, chromadb, random, re, time, datetime, json, os
import numpy as np
import pandas as pd
import pymupdf

from vertexai.generative_models import GenerativeModel, Part
import vertexai.preview.generative_models as generative_models
from vertexai.language_models import TextEmbeddingModel

# Used to securely store your API key
from google.colab import userdata
from google.cloud import storage

from IPython.display import Markdown, HTML, display
from chromadb import Documents, EmbeddingFunction, Embeddings

# Import LangChain components
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader


# Initialize Vertex AI
import vertexai
import sys

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel. The restart code in that cell is commented out by default, so uncomment it before running.

The restart might take a minute or longer. After it has restarted, continue to the next step.

In [4]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ Wait for the kernel to finish restarting before you continue. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [5]:
# Additional authentication is required for Google Colab.
# The check is true only when the `google.colab` module has already been
# imported into this process.
# NOTE(review): the imports cell runs `from google.colab import userdata`
# unconditionally, so outside Colab the notebook fails there before reaching
# this guard — confirm whether non-Colab environments are meant to work.
if "google.colab" in sys.modules:
    # Authenticate the user to Google Cloud; imported here so `auth` is only
    # bound when running inside Colab.
    from google.colab import auth
    auth.authenticate_user()

### Define Google Cloud project information, initialize Vertex AI, and add Secrets

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [6]:
# Project configuration for Vertex AI.
# The project ID is read from the Colab secret named 'ProjectId' so it is not
# hard-coded in the notebook; replace it (and LOCATION) with your own values
# to run in your environment.

PROJECT_ID = userdata.get('ProjectId') # @param {type:"string"}
LOCATION = "us-central1"    # @param {type:"string"}

# Initialise the Vertex AI SDK with the project and region chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Declaring Helper Classes for Embeddings and LLMs

### Helper Functions

In [7]:
def create_clean_folders(PDF_Path):
  """Ensure the PDF folder and ``./output/`` exist, then empty them.

  Everything matching the glob ``<folder>*`` is deleted, using the same
  pattern the previous shell ``rm -rf`` used. Hidden entries (names starting
  with a dot) are not matched.

  Args:
    PDF_Path: Folder path used as the glob prefix. Pass a path that ends in
      a slash (e.g. ``"./downloaded_pdfs/"``) so only the folder's contents
      match; without it the folder itself and any sibling sharing the prefix
      also match.

  Raises:
    OSError: If a folder cannot be created or an entry cannot be removed.
  """
  # Imported locally so the helper brings its own dependencies.
  import glob
  import shutil

  for folder in (PDF_Path, "./output/"):
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)
    for entry in glob.glob(folder + "*"):
      # Symlinks are unlinked rather than followed, as `rm -rf` does.
      if os.path.isdir(entry) and not os.path.islink(entry):
        shutil.rmtree(entry)
      else:
        os.remove(entry)

def update_text(text = "default text"):
    """Return an ``HTML`` display object with ``text`` inside a ``<p>`` element.

    The text is inserted as-is (no HTML escaping).
    """
    paragraph_template = """
        <p>{}</p>
    """
    return HTML(paragraph_template.format(text))

def progress(value =1, max =1):
    """Return an ``HTML`` display object containing a ``<progress>`` bar.

    Args:
        value: Current progress, used for the ``value`` attribute and as the
            element's fallback text.
        max: Value that represents completion (the ``max`` attribute).
    """
    # The stray comma that followed the max attribute has been removed:
    # attributes in a start tag are separated by whitespace only.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 60%'>
            {value}
        </progress>
    """.format(value=value, max=max))


def download_bucket_to_local(bucket_uri, local_folder):
  """Download every object in a Cloud Storage bucket into a local folder.

  Args:
    bucket_uri: Bucket name, without a ``gs://`` prefix or trailing slash.
    local_folder: Destination directory. It, and any sub-directories implied
      by object names containing ``/``, are created when missing.

  Returns:
    A list of ``gs://<bucket>/<object name>`` strings, one per downloaded
    object.
  """
  gcs_uri_list = []
  storage_client = storage.Client()
  bucket = storage_client.bucket(bucket_uri)
  for blob in bucket.list_blobs():
    # A name ending in "/" cannot be written as a file, so skip it.
    if blob.name.endswith("/"):
      continue
    # os.path.join works whether or not local_folder ends with a slash.
    file_path = os.path.join(local_folder, blob.name)
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
      # Nested object names need their parent directory before download.
      os.makedirs(parent_dir, exist_ok=True)
    blob.download_to_filename(file_path)
    gcs_uri_list.append("gs://" + bucket_uri + "/" + blob.name)
    print(f"Downloaded: {blob.name}")
  return gcs_uri_list

# NOTE(review): an identical `update_text` is defined earlier in this same
# cell; this later definition is the one in effect after the cell runs.
def update_text(text = "default text"):
    """Return an ``HTML`` display object with ``text`` inside a ``<p>`` element.

    The text is inserted as-is (no HTML escaping).
    """
    paragraph_template = """
        <p>{}</p>
    """
    return HTML(paragraph_template.format(text))

# NOTE(review): `progress` is also defined earlier in this same cell; this
# later definition is the one in effect after the cell runs.
def progress(value =1, max =1):
    """Return an ``HTML`` display object containing a ``<progress>`` bar.

    Args:
        value: Current progress, used for the ``value`` attribute and as the
            element's fallback text.
        max: Value that represents completion (the ``max`` attribute).
    """
    # The stray comma that followed the max attribute has been removed:
    # attributes in a start tag are separated by whitespace only.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 60%'>
            {value}
        </progress>
    """.format(value=value, max=max))

## Class for Generative Model

In [8]:
class GenModel:
    global generation_config
    generation_config = {
    "max_output_tokens": 8192,
    "temperature": 0,
    "top_p": 0.95}

    def __init__(self, model_name = "gemini-1.5-pro-001", temperature = 1.0):
        # Add a switch case to choose how to initialize the model <zthor>
        self.model = GenerativeModel(model_name, generation_config = generation_config)

        self.model_name = model_name # To be used later for picking models via a Match
        self.temperature = temperature # To be used later for picking models via a Match

    def get_answer(self, prompt):
        match self.model_name:
          case "gemini-1.5-pro-001":
            print("Gemini!")
            response = self.model.generate_content(prompt)
            return response.text
          case "gemma":
            print("Gemma!")
            return "To be implemented"
          case _:
            print("default fun!")


In [9]:
# Smoke test: ask the wrapped model one short question and render the reply.
tester = GenModel(model_name="gemini-1.5-pro-001")
Markdown(tester.get_answer(prompt="Use 5 words to answer this question: What is the meaning of life? "))

Gemini!


To find your own meaning. 


## Class for Plain Embedding Model

In [None]:
class Vanilla_Embedding_Model(EmbeddingFunction):
    """Embedding function backed by a pretrained ``TextEmbeddingModel``.

    Instances are callable: given a sequence of documents they return one
    embedding vector per document, in the same order.
    """

    def __init__(self, model_name= "text-embedding-004"):
        """Load the pretrained embedding model identified by ``model_name``."""
        self.embed_model = TextEmbeddingModel.from_pretrained(model_name)

    def __call__(self, input: Documents) -> Embeddings: # https://github.com/chroma-core/chroma/issues/1496
        """Embed each document in ``input`` with its own model request."""
        return [
            self.embed_model.get_embeddings([text])[0].values
            for text in input
        ]

In [None]:
# Smoke test: embed one sentence and print the resulting vector(s).
tester_embed = Vanilla_Embedding_Model(model_name="text-embedding-004")
print(tester_embed(["To find purpose, meaning, and connection in our existence."]))

# Aiko writing the Finetuning Embeddings Class
- [Similar Notebook](https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/tuned_text-embeddings.ipynb#scrollTo=kIJC46m7SXvW)
- [GCP Guide for Vertex Embeddings](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings)

- [This notebook does almost exactly what you need to do for this section](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro_embeddings_tuning.ipynb)

In [None]:
# # Do not worry about this format, You can change your code to work like this later
# # Be initialized via a method, convert to embeddings via a method, and call the model endpoint via a method

# class FineTuned_Embedding_Model:
#     def __init__(self, model_name, local_folder_location_of_jsonl_and_tsv):
#         self.embed_model = TextEmbeddingModel.from_pretrained("text-embedding-004")
#         # Make sure to have JSONLs & TSV files for finetuning here
#         # Create the finetuned model, using google credentials from earlier

#     def get_embeddings(self, text) -> list:

#         embedding = self.embed_model.get_embeddings([text])
#         vector = embedding[0].values
#         return vector
#         # Change to utilize the Embeddings Endpoint
#         # Will return a vector of embeddings

#     def call_model_endpoint():
#       # Call your Finetuned model from Vertex
#       loader = DataFrameLoader(df, page_content_column="page_content")


# ChromaDB Helper

Key Point: Next, you will choose a model. Any embedding model will work for this tutorial, but for real applications it's important to choose a specific model and stick with it. The outputs of different models are not compatible with each other.

In [None]:
class chroma_db():
  """Helper that owns a Chroma client and one collection of PDF text chunks.

  Typical use: construct, call ``create_collection`` once, then call
  ``add_pdfs`` to chunk and store the text of every PDF in a folder.
  """

  def __call__(self, input: Documents) -> Embeddings: # https://github.com/chroma-core/chroma/issues/1496
    """Embed each document in ``input`` via ``self.embed_model``.

    NOTE(review): nothing in this class assigns ``self.embed_model``, so
    calling an instance raises ``AttributeError`` unless the attribute is set
    externally. Collections use the ``embedding_function`` given to
    ``create_collection`` instead — confirm whether this method is needed.
    """
    embeddings = []
    for doc in input:
      vector = self.embed_model.get_embeddings([doc])
      embeddings.append(vector[0].values)
    return embeddings

  def __init__(self, name):
    """Create the Chroma client and reset the running chunk counter."""
    self.name = name
    self.chunk_id = 0  # last id handed out; ids are "chunk_<n>" starting at 1
    self.client = chromadb.Client()

  def create_collection(self, name, embedding_function, metadata):
    """Create the collection that ``add_pdfs`` writes to.

    Args:
      name: Collection name.
      embedding_function: Callable the collection uses to embed documents.
      metadata: Collection metadata passed through to the client.
    """
    self.collection_name = name
    self.embedding_function = embedding_function
    self.collection = self.client.create_collection(
        name=name,
        embedding_function=embedding_function,
        metadata=metadata)

  def add_pdfs(self, local_pdf_folder, chunk_size=10000, chunk_overlap=200):
    """Extract, chunk, and store the text of every file in a folder.

    Args:
      local_pdf_folder: Directory containing the PDFs to load.
      chunk_size: Maximum chunk length passed to the text splitter.
      chunk_overlap: Overlap between consecutive chunks.

    ``create_collection`` must have been called first; otherwise
    ``self.collection`` does not exist.
    """
    pdf_string_list = []
    # Sorted so chunk ids are assigned in the same order on every run
    # (os.listdir order is arbitrary).
    for pdf in sorted(os.listdir(local_pdf_folder)):
      pdf_path = os.path.join(local_pdf_folder, pdf)
      if not os.path.isfile(pdf_path):
        # Sub-directories cannot be opened as documents.
        continue
      with pymupdf.open(pdf_path) as doc:
        pdf_string_list.append("".join(page.get_text() for page in doc))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)

    display_out = display(update_text("Start Chunking..."), display_id=True)
    for pdf_text in pdf_string_list:
      for chunk in text_splitter.split_text(pdf_text):
        self.chunk_id += 1
        display_out.update(update_text(f"Adding Chunk: {self.chunk_id}"))
        # ids is passed as a list so it lines up one-to-one with documents.
        self.collection.add(documents=[chunk], ids=[f"chunk_{self.chunk_id}"])
    display_out.update(update_text("All Chunks loaded!"))


## Testing Chroma & Embeddings

In [None]:
# Test run: reset the local folders, then pull every object from the bucket.

# The bucket should contain only PDFs.
gcs_bucket = "dmv-pdf-analysis" # Do not put any slashes after uri!
pdf_folder ="./downloaded_pdfs/" # Include a slash after the uri!

create_clean_folders(PDF_Path=pdf_folder)
gcs_pdf_list = download_bucket_to_local(bucket_uri=gcs_bucket, local_folder=pdf_folder)


In [None]:
# Build a cosine-distance collection and load the downloaded PDFs into it.
chroma = chroma_db(name="My name is.. chroma DB client!")

chroma.create_collection(
    name="chroma_testing",
    embedding_function=Vanilla_Embedding_Model(),
    metadata={"hnsw:space": "cosine"},
)

chroma.add_pdfs(local_pdf_folder=pdf_folder)

In [None]:
# Inspect what was stored: full contents, a small sample, and the item count.
print(chroma.collection.get())
print(chroma.collection.peek())
print(chroma.collection.count())

# Retrieve the five passages closest to a sample question
# (typo "a a kid" in the query text corrected to "as a kid").
query_relevant_passage = chroma.collection.query(query_texts=["What do you have to do as a kid to get your license?"], n_results=5)
print(query_relevant_passage)
# Best match for the first (and only) query text.
print(query_relevant_passage['documents'][0][0])

In [None]:
# chroma.client.delete_collection("chroma_testing")

# Evaluation

In [None]:
# Fill Sandwich choices for testing

# Next steps

To learn more about how you can use the embeddings, check out the [examples](https://ai.google.dev/examples?keywords=embed) available. To learn how to use other services in the Gemini API, visit the [Python quickstart](https://ai.google.dev/gemini-api/docs/get-started/python).