In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Multimodal Retrieval Augmented Generation (RAG) using Vertex AI Gemini API

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_QnA_usecase.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fretrieval-augmented-generation%2Fmultimodal_rag_QnA_usecase.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_QnA_usecase.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_QnA_usecase.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>

| | |
|-|-|
|Author(s) | [Aakash Gouda](https://github.com/aksstar) , [Bhushan Garware](https://github.com/BhushanGarware)|

## Overview

Retrieval augmented generation (RAG) has become a popular paradigm for enabling LLMs to access external data and also as a mechanism for grounding to mitigate against hallucinations.

In this notebook, you will learn how to perform RAG where you will perform Q&A over a document filled with both text and images.

### Gemini

Gemini is a family of generative AI models developed by Google DeepMind that is designed for multimodal use cases. The Gemini API gives you access to the Gemini 1.0 Pro Vision and Gemini 1.0 Pro models.

### Comparing text-based and multimodal RAG

Multimodal RAG offers several advantages over text-based RAG:

1. **Enhanced knowledge access:** Multimodal RAG can access and process both textual and visual information, providing a richer and more comprehensive knowledge base for the LLM.
2. **Improved reasoning capabilities:** By incorporating visual cues, multimodal RAG can make better informed inferences across different types of data modalities.

This notebook shows you how to use multimodal RAG with the Vertex AI Gemini API and [text embeddings](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text-embeddings) to build a question answering system for a PDF document.


### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

### Objectives

This notebook provides a guide to building a question answering system using multimodal retrieval augmented generation (RAG).

You will complete the following tasks:

1. Extract data from documents containing both text and images using Gemini 1.0 Pro Vision, generate embeddings of the extracted data, and store them in a vector store
2. Search the vector store with text queries to find similar text data
3. Using the retrieved text data as context, generate an answer to the user query using the Gemini Pro model.

## Getting Started

### Install libraries


In [None]:
!pip install --upgrade --quiet pymupdf langchain gradio google-cloud-aiplatform

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Shut the current kernel down and ask for a restart (the True argument) so the
# freshly installed packages are picked up by the new session.
IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ Wait for the kernel to finish restarting before you continue. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
import sys

# The extra authentication step is only performed when the google.colab module
# is already loaded in this interpreter.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    # Authenticate the user to Google Cloud
    auth.authenticate_user()

### Define Google Cloud project information and initialize Vertex AI

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Define project information.
# Replace the placeholder with the ID of your own Google Cloud project.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Fail fast with a clear message when the placeholder was not replaced, instead
# of surfacing a less obvious error from a later API call.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    raise ValueError(
        "Set PROJECT_ID to your Google Cloud project ID before continuing."
    )

# Initialize Vertex AI
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

In [None]:
# File system operations and displaying images
import os
# Libraries for downloading files, data manipulation, and creating a user interface
import urllib.request

import gradio as gr
import pandas as pd
# Initialize Vertex AI libraries for working with generative models
from vertexai.generative_models import GenerativeModel, Image

# Create the local "utils" package directory; exist_ok makes re-running this cell safe
os.makedirs("utils", exist_ok=True)

# Download utility files from a GitHub repository.
# NOTE(review): these files come from the moving `main` branch and are imported
# further down; consider pinning the URL to a specific commit for reproducibility.
url_prefix = "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/language/use-cases/document-qa/utils"
files = ["__init__.py", "matching_engine.py", "matching_engine_utils.py"]

for fname in files:
    urllib.request.urlretrieve(
        f"{url_prefix}/{fname}", filename=os.path.join("utils", fname)
    )

# Import utility functions for timing and file handling
import time

import fitz
# Print Vertex AI SDK version
from google.cloud import aiplatform
from PIL import Image as PIL_Image

print(f"Vertex AI SDK version: {aiplatform.__version__}")

# Import LangChain components
import langchain

print(f"LangChain version: {langchain.__version__}")
from langchain.chains import RetrievalQA
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader
# Import custom vector search packages
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils

### Initializing Gemini Vision Pro and Text Embedding models

In [None]:
# Load the Gemini multimodal (vision) model; it is used later to extract text
# and tables from the rendered page images.
multimodal_model = GenerativeModel("gemini-1.0-pro-vision")

# Initialize the text embedding model; it is later handed to the vector store
# via MatchingEngine.from_components(embedding=embeddings).
embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@003")

### Download a sample PDF file from the internet [Skip this step if you have uploaded your PDF file]

In [None]:
!wget https://www.hitachi.com/rev/archive/2023/r2023_04/pdf/04a02.pdf

### Split PDF pages to Images

In [None]:
# Name of the PDF to process; the next cell opens it and renders its pages to images.
# Run the following code for each file you want to process.
PDF_FILENAME = "04a02.pdf"  # <-- Replace with your filename

In [None]:
# Create the image output directory; exist_ok makes re-running this cell safe
Image_Path = "./Images/"
os.makedirs(Image_Path, exist_ok=True)

# To get better resolution
zoom_x = 2.0  # horizontal zoom
zoom_y = 2.0  # vertical zoom
mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension

# Use only the file name as the image prefix so that a PDF_FILENAME containing a
# directory component still yields valid paths inside Image_Path.
pdf_basename = os.path.basename(PDF_FILENAME)

# The context manager closes the document once all pages are rendered
with fitz.open(PDF_FILENAME) as doc:
    for page in doc:  # iterate through the pages
        pix = page.get_pixmap(matrix=mat)  # render page to an image
        # Build the path from Image_Path rather than a second hard-coded "./Images/"
        outpath = os.path.join(Image_Path, f"{pdf_basename}_{page.number}.jpg")
        pix.save(outpath)  # store image as a JPEG (per the .jpg extension)

This cell processes the page images, extracting text and tabular data using a multimodal model (Gemini Vision Pro).
It handles potential errors by retrying, and stores the extracted information in a DataFrame that the later indexing cells read.

In [None]:
# Tunables for the extraction loop
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")  # entries of Image_Path treated as page images
MAX_ATTEMPTS = 5  # attempts per image before it is skipped
RETRY_DELAY_SECONDS = 1  # pause between attempts


def page_sort_key(file_name):
    """Return a sort key ordering "<name>_<n>.<ext>" files by their numeric suffix.

    os.listdir() returns names in arbitrary order, and plain string sorting
    would put "..._10" before "..._2". Files without a numeric suffix are
    ordered by their name.
    """
    stem = os.path.splitext(file_name)[0]
    prefix, _sep, suffix = stem.rpartition("_")
    if suffix.isdecimal():
        return (prefix, int(suffix))
    return (stem, -1)


# Collect the page images in page order, ignoring sub-directories and other files
image_names = sorted(
    (
        name
        for name in os.listdir(Image_Path)
        if name.lower().endswith(IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(Image_Path, name))
    ),
    key=page_sort_key,
)
Max_images = len(image_names)

# Prompts for text and table extraction (identical for every image)
prompt_text = "Extract all text content in the image"
prompt_table = "Detect table in this image. Extract content maintaining the structure"

# Create empty lists to store image information
page_source = []
page_content = []
page_id = []

for p_id, image_name in enumerate(image_names):
    image_path = os.path.join(Image_Path, image_name)

    # Each image gets its own retry budget, so failures on earlier images
    # cannot cause a later image to be skipped prematurely.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # Load the image
            image = Image.load_from_file(image_path)

            # Extract text, then tables, using the multimodal model
            text_content = multimodal_model.generate_content(
                [image, prompt_text]
            ).text
            table_content = multimodal_model.generate_content(
                [image, prompt_table]
            ).text
        except Exception as err:
            # Report the failure and pause before the next attempt
            print(err)
            print("Taking Some Rest")
            time.sleep(RETRY_DELAY_SECONDS)
        else:
            # Log progress and store results
            print(f"processed image no: {p_id}")
            page_source.append(image_path)
            page_content.append(text_content + "\n" + table_content)
            page_id.append(p_id)
            break
    else:
        # All attempts failed: skip this image and move on to the next one
        print(f"Can not process image no: {image_path}")

# Create a DataFrame to store extracted information
df = pd.DataFrame(
    {"page_id": page_id, "page_source": page_source, "page_content": page_content}
)
del page_id, page_source, page_content  # Conserve memory
df.head()  # Preview the DataFrame

### Creating Vertex AI: Vector Search
The code configures and deploys a vector search index on Google Cloud, making it ready to store and search through embeddings.

In [None]:
# Region passed to the Vector Search helper and to the vector store below
VECTOR_SEARCH_REGION = "us-central1"
# Index name handed to MatchingEngineUtils; prefixed with the project ID
VECTOR_SEARCH_INDEX_NAME = f"{PROJECT_ID}-vector-search-index-ht"
# Name of the GCS bucket (created in the next cell) used for the embeddings
VECTOR_SEARCH_EMBEDDING_DIR = f"{PROJECT_ID}-vector-search-bucket-ht"
# Vector size of the index; presumably has to match the output size of the
# embedding model configured above — verify if the embedding model changes
VECTOR_SEARCH_DIMENSIONS = 768

In [None]:
# Creates a GCS bucket for storing vector search embedding
! set -x && gsutil mb -p $PROJECT_ID -l us-central1 gs://$VECTOR_SEARCH_EMBEDDING_DIR

In [None]:
# Initalizing vector search from utils
vector_search = MatchingEngineUtils(
    PROJECT_ID, VECTOR_SEARCH_REGION, VECTOR_SEARCH_INDEX_NAME
)

In [None]:
# Create and Deploy Vector Search Index
index = vector_search.create_index(
    embedding_gcs_uri=f"gs://{VECTOR_SEARCH_EMBEDDING_DIR}/init_index",
    dimensions=VECTOR_SEARCH_DIMENSIONS,
    index_update_method="streaming",
    index_algorithm="tree-ah",
)
if index:
    print(index.name)

index_endpoint = vector_search.deploy_index()
if index_endpoint:
    print(f"Index endpoint resource name: {index_endpoint.name}")
    print(
        f"Index endpoint public domain name: {index_endpoint.public_endpoint_domain_name}"
    )
    print("Deployed indexes on the index endpoint:")
    for d in index_endpoint.deployed_indexes:
        print(f"    {d.id}")

This code snippet prepares textual data for storage in a vector search engine. The end goal is to enable efficient similarity-based search queries against this textual data.

In [None]:
# Wrap every DataFrame row as a LangChain document whose text is the
# "page_content" column; the remaining columns end up in the document metadata.
loader = DataFrameLoader(df, page_content_column="page_content")
documents = loader.load()
print(f"# of documents loaded (pre-chunking) = {len(documents)}")

# Split the documents into chunks of roughly 10000 characters, without overlap
text_splitter = CharacterTextSplitter(chunk_size=10000, chunk_overlap=0)
doc_splits = text_splitter.split_documents(documents)

# Number the chunks sequentially so each one can be tracked through its metadata
for chunk_number, chunk_doc in enumerate(doc_splits):
    chunk_doc.metadata["chunk"] = chunk_number
print(f"# of documents = {len(doc_splits)}")

# Look up the IDs of the index and endpoint created earlier
VECTOR_SEARCH_INDEX_ID, VECTOR_SEARCH_ENDPOINT_ID = (
    vector_search.get_index_and_endpoint()
)
print(f"VECTOR_SEARCH_INDEX_ID={VECTOR_SEARCH_INDEX_ID}")
print(f"VECTOR_SEARCH_INDEX_ENDPOINT_ID={VECTOR_SEARCH_ENDPOINT_ID}")

# Build the vector store object on top of the deployed index.
# The bucket name is the third "/"-separated piece of the gs:// URI.
embedding_bucket_uri = f"gs://{VECTOR_SEARCH_EMBEDDING_DIR}"
vector_store_object = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=VECTOR_SEARCH_REGION,
    gcs_bucket_name=embedding_bucket_uri.split("/")[2],
    embedding=embeddings,
    index_id=VECTOR_SEARCH_INDEX_ID,
    endpoint_id=VECTOR_SEARCH_ENDPOINT_ID,
)

# Collect the chunk texts plus, per chunk, two metadata entries:
# the originating page image ("source") and the chunk number ("chunk")
texts = []
metadatas = []
for chunk_doc in doc_splits:
    texts.append(chunk_doc.page_content)
    metadatas.append(
        [
            {
                "namespace": "source",
                "allow_list": [chunk_doc.metadata["page_source"]],
            },
            {
                "namespace": "chunk",
                "allow_list": [str(chunk_doc.metadata["chunk"])],
            },
        ]
    )

# Embed the text chunks and store them in the vector search index
doc_ids = vector_store_object.add_texts(texts=texts, metadatas=metadatas)

### Ask Questions to the PDF
This code snippet establishes a question-answering (QA) system.  It leverages a vector search engine to find relevant information from a dataset and then uses the 'gemini-pro' LLM model to generate and refine the final answer to a user's query.

In [None]:
# LLM model initalization for Retrieval Chain
gemini_pro_model = VertexAI(
    model_name="gemini-pro",  # Name of the LLM model on Vertex AI
    max_output_tokens=1024,  # Limits the maximum response length
    temperature=0,  # Controls randomness (lower = more deterministic)
    top_p=0.8,
    top_k=40,  # Sampling parameters for text generation
    verbose=True,  # Enables logging
)


def Test_LLM_Response(txt, llm=None):
    """
    Decide whether an LLM-generated answer actually contains information.

    The answer is embedded in a few-shot classification prompt with the two
    categories "Information Present" and "Information Not Present", and the
    prompt is sent to an LLM. The predicted category is compared after
    normalizing case and whitespace, so a leading space, a trailing newline or
    different capitalization in the model output does not change the outcome.

    Args:
        txt (str): The text response generated by the LLM.
        llm (callable, optional): Callable taking the prompt string and
            returning the predicted category. Defaults to the notebook-level
            `gemini_pro_model`.

    Returns:
        bool: False if the predicted category says the information is not
              present, True otherwise.
    """
    classifier = gemini_pro_model if llm is None else llm

    classification_prompt = f""" Classify the text as one of the following categories:
        -Information Present
        -Information Not Present
        Text=The provided context does not contain information.
        Category:Information Not Present
        Text=I cannot answer this question from the provided context.
        Category:Information Not Present
        Text:{txt}
        Category:"""

    category = classifier(classification_prompt)

    # Collapse whitespace and ignore case; matching on "not present" also
    # tolerates a misspelled first word in the returned category.
    normalized = " ".join(str(category).lower().split())
    return "not present" not in normalized


def get_answer(query, max_k=4):
    """
    Retrieves an answer to a provided query using a vector search system and an LLM.

    Args:
        query (str): The user's query.
        max_k (int): Largest number of documents to retrieve. One attempt is
            made for each k = 1 .. max_k, so this is also the maximum number
            of attempts. Defaults to 4.

    Returns:
        dict: The output of the RetrievalQA chain. The answer is stored under
              the 'result' key; the chain is configured with
              `return_source_documents=True`.

    Raises:
        ValueError: If `max_k` is smaller than 1.
        RuntimeError: If every attempt failed and no result is available; the
            last underlying error is attached as the cause.

    Steps:

    1. **Document Retrieval:** Fetches the `k` most similar documents from `vector_store_object`.
    2. **Answer Generation:** Runs a LangChain RetrievalQA chain with `gemini_pro_model` on them.
    3. **Quality Check:** Uses `Test_LLM_Response` to decide whether the answer contains information.
    4. **Iteration:** Retries with a larger `k` while the answer is insufficient, up to `max_k`.
    5. **Error Handling:** A failed attempt is reported and skipped; the most
       recent successful result is returned if no attempt passes the check.
    """
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    best_result = None
    last_error = None

    # A bounded loop guarantees termination even when attempts keep failing
    for k in range(1, max_k + 1):
        try:
            # Configure retriever using your vector search system
            retriever = vector_store_object.as_retriever(
                search_type="similarity", search_kwargs={"k": k}
            )
            # Setup a RetrievalQA chain (fetch documents + use LLM)
            qa = RetrievalQA.from_chain_type(
                llm=gemini_pro_model,
                chain_type="stuff",
                retriever=retriever,
                return_source_documents=True,
            )
            result = qa({"query": query})
            best_result = result  # Keep it even if the quality check below fails
            if Test_LLM_Response(result["result"]):
                break  # Valid response, stop retrying
        except Exception as err:
            # Remember the failure and move on to the next k instead of aborting
            last_error = err
            print(f"Attempt with k={k} failed: {err}")

    if best_result is None:
        raise RuntimeError(
            "Could not generate an answer for the query"
        ) from last_error
    return best_result


# Example question about the sample PDF (editable through the @param form field)
question = "what is the 5th step of Transformer Manufacturing Flow ?"  # @param {type:"string"}
print(get_answer(question))

### Ask Questions to the PDF using Gradio UI
This code creates a web-based frontend for your question-answering system, allowing users to easily enter queries and see the results along with relevant images.

In [None]:
def gradio_query(query):
    """
    Answer a query and look up the image of the page the answer came from.

    Args:
        query (str): The user's query.

    Returns:
        list: `[answer, image]`, where `answer` is the text stored under the
              'result' key of `get_answer(query)` and `image` is a PIL image.
              `image` is None when neither the reference image nor the default
              image "./Images/blank.jpg" can be opened.
    """
    # Retrieve the answer from your QA system
    result = get_answer(query)
    answer = result["result"]

    try:
        # The "source" metadata of the last retrieved document is used as the image path
        ref = result["source_documents"][-1].metadata["source"]
        image = PIL_Image.open(ref)  # Open the reference image
    except Exception as err:
        print(f"Could not load the reference image: {err}")
        try:
            # Use the default image if one was provided
            image = PIL_Image.open("./Images/blank.jpg")
        except OSError:
            # No usable default image either: return no image instead of crashing
            image = None

    return [answer, image]  # Return both the text answer and the image


gr.close_all()  # Ensure a clean Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Input / Output Components
            query = gr.Textbox(label="Query", info="Enter your query")
            btn_enter = gr.Button("Process")
            answer = gr.Textbox(label="Response", info="Answer generated from the PDF")
            btn_clear = gr.Button("Clear")
        with gr.Column():
            image = gr.Image(label="Reference", visible=True)

    # Button Click Events
    btn_enter.click(fn=gradio_query, inputs=query, outputs=[answer, image])
    # Reset the query, the response and the reference image
    btn_clear.click(
        fn=lambda: ("", "", None), inputs=None, outputs=[query, answer, image]
    )

# NOTE(review): share=True requests a publicly reachable link and debug=True keeps
# this cell running until it is interrupted — confirm both are intended.
demo.launch(share=True, debug=True)  # Launch the Gradio app

### Cleaning up
To clean up all Google Cloud resources used in this project, you can delete the Google Cloud project you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

In [None]:
# Delete the Vector Search resources through the helper: first the index
# endpoint, then the index.
# NOTE(review): the GCS bucket created earlier with `gsutil mb` does not appear
# to be removed here — verify, and delete it separately if it is no longer needed.
vector_search.delete_index_endpoint()
vector_search.delete_index()