In [1]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Getting Started with the Vertex AI Gemini API & Python SDK

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/language/rag_evaluation/deepeval_with_gemini.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/rag_evaluation/deepeval_with_gemini.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/blob/main/language/rag_evaluation/deepeval_with_gemini.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>


## Overview

### [DeepEval](https://docs.confident-ai.com/docs/getting-started)

DeepEval is an open-source evaluation framework for LLMs. DeepEval makes it extremely easy to build and iterate on LLM (applications) and was built with the following principles in mind:

- Easily "unit test" LLM outputs in a similar way to Pytest.
- Plug-and-use 14+ LLM-evaluated metrics, most with research backing.
- Synthetic dataset generation with state-of-the-art evolution techniques.
- Metrics are simple to customize and covers all use cases.
- Real-time evaluations in production.

### Gemini

Gemini is a family of generative AI models developed by Google DeepMind that is designed for multimodal use cases. The Gemini API gives you access to the Gemini Pro Vision and Gemini Pro models.

### Vertex AI Gemini API

The Vertex AI Gemini API provides a unified interface for interacting with Gemini models. There are currently two models available in the Gemini API:

- **Gemini Pro model** (`gemini-pro`): Designed to handle natural language tasks, multiturn text and code chat, and code generation.
- **Gemini Pro Vision model** (`gemini-pro-vision`): Supports multimodal prompts. You can include text, images, and video in your prompt requests and get text or code responses.

You can interact with the Gemini API using the following methods:

- Use the [Vertex AI Studio](https://cloud.google.com/generative-ai-studio) for quick testing and command generation
- Use cURL commands
- Use the Vertex AI SDK

This notebook focuses on using the **Gemini model with DeepEval**

For more information, see the [Generative AI on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) documentation.


### Objectives

In this notebook we will focus on using the Vertex AI Gemini API with RAGAS
We will use the Gemini Pro (`gemini-pro`) model for Q&A evaluation.

You will complete the following tasks:

- Install the Vertex AI SDK for Python
- Use the Vertex AI Gemini API to interact with each model
  - Gemini Pro (`gemini-pro`) model:
    - Q&A Generation
    - Evaluate Q&A performance with RAGAS

### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started


### Install Vertex AI SDK for Python


In [None]:
#This notebook was created and tested with below versions 
! pip3 install --user install deepeval==0.21.51 \
datasets==2.18.0 \
langchain==0.1.14 \
langchain-google-vertexai==1.0.5 \
langchain-chroma==0.1.1 \
chromadb==0.5.0 \
pypdf==4.2.0 \

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, it is recommended to restart the runtime. Run the following cell to restart the current kernel.

The restart process might take a minute or so.


In [2]:
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

: 

After the restart is complete, continue to the next step.


<div class="alert alert-block alert-warning">
<b>⚠️ Wait for the kernel to finish restarting before you continue. ⚠️</b>
</div>


## Import libraries


In [1]:
import itertools
import pytest

import pandas as pd
import vertexai

from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Langchain package for Vertex AI
from langchain_google_vertexai import ( # type: ignore[import-untyped]
    ChatVertexAI,
    VertexAIEmbeddings,
    HarmBlockThreshold,
    HarmCategory
)

# Base LLM for Deepeval
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval import (
    assert_test, 
    evaluate
)
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

In [2]:
# TODO(developer): Update the below lines
PROJECT_ID = "<your_project"
LOCATION = "your_region"

PROJECT_ID = "vertexai-pgt"
LOCATION = "us-central1"

vertexai.init(project=PROJECT_ID
              , location=LOCATION)

## Use Vertex AI models

The [Gemini 1.5 Pro](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/overview#text-use-cases) models are designed to handle natural language tasks, multiturn text and code chat, and code generation.


In [34]:
# Initilialize safety filters for vertex model
safety_settings = {
    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
}

generation_config = {'temperature' : 0.0 
                     , 'topk' : 1
                     }

# Initialise the ChatVertexAI model
custom_chat_model_gemini = ChatVertexAI(
    model_name="gemini-1.0-pro-002"
    , safety_settings=safety_settings
    , generation_config=generation_config
    , project= PROJECT_ID
    , location= LOCATION
    , response_validation=False #Important since deepval cannot handle validation errors
)

The [Vertex AI Embeddings](https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings) models are designed to convert text to dense vector representations

In [35]:
#Load Embeddings Models
embeddings = VertexAIEmbeddings(model_name = "textembedding-gecko@003")

## Create a local Vector DB
### Load the document

In [5]:
#source document
document_uri = "https://arxiv.org/pdf/1706.03762"

In [6]:
#use PyPDF loade to read and chunk the input document
loader = PyPDFLoader(document_uri)
docs = loader.load_and_split()

#Verify if pages are loaded correctly
docs[0]

Document(page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing w

### Create local Vector DB

In [11]:
# Create an in-memory Vector DB using Chroma
vectordb = Chroma.from_documents(docs, embeddings)

In [10]:
#Set Vector DB as retriever
retriever = vectordb.as_retriever()

### Create Q&A Chain

In [12]:
#Create Q&A template for the Gemini Model
template = """You task is to answer questions related documents.
Use the following context to answer the question at the end.
{context}

Answers should be crisp.

Question: {question}
Helpful Answer:"""

#Create a prompt template for the q&a chain
PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

#Pass prompts to q&a chain
chain_type_kwargs = {"prompt": PROMPT}

#Retreiver arguments
retriever.search_kwargs = {"k": 1}

In [13]:
#Setup a RetrievalQA Chain
qa = RetrievalQA.from_chain_type(
    llm=custom_chat_model_gemini
    , chain_type="stuff"
    , retriever=retriever
    , return_source_documents=True
    , chain_type_kwargs=chain_type_kwargs
    )

In [14]:
#Test the chain with a sample question
query = "Who are the authors of paper on Attention is all you need"
result = qa({"query": query})
result

  warn_deprecated(


{'query': 'Who are the authors of paper on Attention is all you need',
 'result': 'Vaswani et. al. are the authors of the paper on "Attention is All You Need". The authors\' full names are listed along with their affiliations in the paper.\n',
 'source_documents': [Document(page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional n

## Evaluation
### Create the evalation set

In [49]:
#Evaluation set with questions and ground_truth
questions = ["What architecture is proposed in paper titled Attention is all you need?",
             "Where do primary authors of paper titled Attention is all you need work?"
            ]
ground_truth = ["Transformers architecture",
                "Google Brain"
              ]

### Run the [Q&A chain](#create-qa-chain) on evaluation dataset 

In [50]:
contexts = []
answers = []

#Generate contexts and answers for each question
for query in questions:
  result = qa({"query": query})
  contexts.append([document.page_content for document in result.get('source_documents')])
  answers.append(result.get('result'))

In [51]:
# Convert into a dataset and prepare for consumption by DeepEval API
dataset = []
for (q, a, c , g) in itertools.zip_longest(questions, answers, contexts , ground_truth):
    dataset.append({'Question' : q 
                    , 'Answer' : g 
                    , "Context" : c })

#Inspect the dataset  
dataset

[{'Question': 'What architecture is proposed in paper titled Attention is all you need?',
  'Answer': 'Transformers architecture',
  'Context': ['Figure 1: The Transformer - model architecture.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\nEncoder: The encoder is composed of a stack of N= 6 identical layers. Each layer has two\nsub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-\nwise fully connected feed-forward network. We employ a residual connection [ 11] around each of\nthe two sub-layers, followed by layer normalization [ 1]. That is, the output of each sub-layer is\nLayerNorm( x+ Sublayer( x)), where Sublayer( x)is the function implemented by the sub-layer\nitself. To facilitate these residual connections, all sub-layers in the m

## IMPORTANT : Gemini with DeepEval
> DeepEval is designed to work with OpenAI Models by default. We must wite a Wrapper to make it work with Gemini

In [52]:
# Base LLM for Deepeval

class GoogleVertexAIDeepEval(DeepEvalBaseLLM):
    """ Class to implement Vertex AI for DeepEval"""
    def __init__(self, model): # pylint: disable=W0231
        self.model = model

    def load_model(self): # pylint: disable=W0221
        return self.model

    def generate(self, prompt: str) -> str: # pylint: disable=W0221
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str: # pylint: disable=W0221
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self): # pylint: disable=W0236 , W0221
        return "Vertex AI Model"

In [53]:
# initiatialize the Deepeval wrapper class
google_vertexai_gemini_deepeval = GoogleVertexAIDeepEval(model=custom_chat_model_gemini)

### Run the DeepEval Evaluation

In [55]:
answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5 
                                                    , model=google_vertexai_gemini_deepeval
                                                    , async_mode= False
                                                    )
test_cases = []
for record in dataset:
    test_cases.append(LLMTestCase(
        input=record['Question'],
        actual_output=record['Answer'],
        retrieval_context=record['Context']
    )
    )

In [57]:
# Evaluate test cases in bulk
evaluate(test_cases, [answer_relevancy_metric])

Output()

Evaluating test cases...
Event loop is already running. Applying nest_asyncio patch to allow async execution...


Output()





Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Vertex AI Model, reason: The answer relevancy score is 1.00. This is the highest possible score, indicating that the response is perfectly relevant to the input. There are no irrelevant statements in the response. 

This is a great example of a concise and informative response. It accurately reflects the content of the input and provides a clear and helpful answer. 
, error: None)

For test case:

  - input: What architecture is proposed in paper titled Attention is all you need?
  - actual output: Transformers architecture
  - expected output: None
  - context: None
  - retrieval context: ['Figure 1: The Transformer - model architecture.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\



[TestResult(success=True, metrics=[<deepeval.metrics.answer_relevancy.answer_relevancy.AnswerRelevancyMetric object at 0x31902c2d0>], input='What architecture is proposed in paper titled Attention is all you need?', actual_output='Transformers architecture', expected_output=None, context=None, retrieval_context=['Figure 1: The Transformer - model architecture.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\nEncoder: The encoder is composed of a stack of N= 6 identical layers. Each layer has two\nsub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-\nwise fully connected feed-forward network. We employ a residual connection [ 11] around each of\nthe two sub-layers, followed by layer normalization [ 1]. That is, the output of each sub-layer i

In [56]:
#measure single instance
answer_relevancy_metric.measure(test_cases[0])

Output()

1.0

### To use DeepEval with Pytest with 

In [58]:
%%writefile ./scripts/vertex_llm.py

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Custom Class implementation"""

# Langchain package for Vertex AI
from langchain_google_vertexai import ( # type: ignore[import-untyped]
    ChatVertexAI,
    HarmBlockThreshold,
    HarmCategory
)

# Base LLM for Deepeval
from deepeval.models.base_model import DeepEvalBaseLLM

class GoogleVertexAIDeepEval(DeepEvalBaseLLM):
    """ Class to implement Vertex AI for DeepEval"""
    def __init__(self, model): # pylint: disable=W0231
        self.model = model

    def load_model(self): # pylint: disable=W0221
        return self.model

    def generate(self, prompt: str) -> str: # pylint: disable=W0221
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str: # pylint: disable=W0221
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self): # pylint: disable=W0236 , W0221
        return "Vertex AI Model"

# TODO(developer): Update the below lines
PROJECT_ID = "<your_project"
LOCATION = "your_region"

PROJECT_ID = "vertexai-pgt"
LOCATION = "us-central1"

# Initilialize safety filters for vertex model
safety_settings = {
    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
}

# Initialise the ChatVertexAI model
custom_model_gemini = ChatVertexAI(
    model_name="gemini-1.0-pro-002"
    , safety_settings=safety_settings
    , project=PROJECT_ID
    , location=LOCATION
    , response_validation=False #Important since deepval cannot handle validation errors
)

# initiatialize the Deepeval wrapper class
google_vertexai_gemini_deepeval = GoogleVertexAIDeepEval(model=custom_model_gemini)

Overwriting ./scripts/vertex_llm.py


In [59]:
%%writefile ./scripts/test_chatbot.py

import itertools
import pytest

from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from vertex_llm import google_vertexai_gemini_deepeval

#Evaluation set with questions and ground_truth
questions = ["What architecture is proposed in paper titled Attention is all you need?",
             "Where do primary authors of paper titled Attention is all you need work?"
            ]
ground_truth = ["Transformers architecture",
                "Google Brain"
              ]

# Convert into a dataset and prepare for consumption by DeepEval API
test_set = []
for (q, a) in itertools.zip_longest(questions, ground_truth):
    test_set.append({'Question' : q 
                    , 'Answer' : a 
                    , "Context" : None })

@pytest.mark.parametrize('record', test_set)
def test_answer_relevancy(record):
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5 
                                                    , model=google_vertexai_gemini_deepeval
                                                    )
    test_case = LLMTestCase(
        input=record['Question'],
        actual_output=record['Answer'],
        retrieval_context=record['Context']
    )
    assert_test(test_case, [answer_relevancy_metric])

Overwriting ./scripts/test_chatbot.py


In [60]:
#run the pytest scripts
!pytest scripts/

platform darwin -- Python 3.11.6, pytest-7.4.4, pluggy-1.3.0
rootdir: /Users/adityarane/Documents/Projects/gcp-generative-ai/language/rag_evaluation
plugins: deepeval-0.21.51, repeat-0.9.3, anyio-3.7.1, xdist-3.6.1
collected 2 items                                                              [0m[1m

scripts/test_chatbot.py [32m.[0m[32m.[0m[32m                                               [100%][0mRunning teardown with pytest sessionfinish[33m...[0m


Error in sys.excepthook:

Original exception was:


# Conclusion

In this notebook, you learned:

1. Deepeval - Framework for evaluation .
2. Making Deepeval Work with Vertex AI Gemini API
3. Integrating DeepEval with Pytest