In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex Agent Builder and Dialogflow CX Evaluations
In this notebook, we will show you how to:
1. Build a new Evaluation Dataset for your Agent.
2. Run the Evaluations to get Quality Metrics.
3. Push the Results to Google Sheets for reporting.


## Prerequisites
- Existing Agent Builder or DFCX Agent w/ or w/out Tool calling.
<br>

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/dfcx-scrapi/blob/main/examples/vertex_ai_conversation/datastores_and_search.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/dfcx-scrapi/blob/main/examples/vertex_ai_conversation/datastores_and_search.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/dfcx-scrapi/main/examples/vertex_ai_conversation/datastores_and_search.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>

In [None]:
!pip install dfcx-scrapi>=1.12.0

import sys

if "google.colab" in sys.modules:
    from google.colab import auth
    from google.auth import default

    auth.authenticate_user()
    credentials, _ = default()
else:
    # Otherwise, attempt to discover local credentials as described in
    # https://cloud.google.com/docs/authentication/application-default-credentials
    pass


# Evaluation Dataset Format

Collecting and evaluating multi-turn chat data can be complex, so we have devised a template that you can follow to make it easy and scalable.

The evaluation dataset must be in a tabular format and contain the following columns:
- `eval_id` 
  - _Unique identifier of a conversation, which must be the same for each row that is part of the same conversation_
- `action_id` 
  - _Index of the specific action for the current conversation._
  - _This is used to track and pair responses during inference time._
- `action_type` 
  - _The specific action for this turn._
  - _There are currently 4 supported Action Types: `User Utterance`, `Agent Response`, `Playbook Invocation`, `Tool Invocation`_
- `action_input`
  - _The input for this specific `action_type`._
  - _Based on the specified `action_type`, this could be the expected user utterance or agent response, the expected Playbook name, or the expected Tool name._
- `action_input_parameters`
  - _When `Playbook Invocation` or `Tool Invocation` is selected as the `action_type`, this refers to the payload of information that is expected to be sent with that invocation._
  - _For example, a JSON payload of key/value pairs called with the tool._
- `tool_action`
  - _This field is only used when `Tool Invocation` is chosen as the `action_type`._
  - _This allows us to run evaluations on whether the Tool call chose the correct internal action (if more than one exists)_

---

An example of the evaluation dataset is shown in the table below:

| eval_id | action_id | action_type | action_input | action_input_parameters | tool_action | notes |
|---|---|---|---|---|---|---|
| travel-ai-001 | 1 | User Utterance | Paris |  |  |  |
| travel-ai-001 | 2 | Playbook Invocation | Travel Inspiration |  |  |  |
| travel-ai-001 | 3 | Agent Response | Paris is a beautiful city! Here are a few things you might enjoy doing there:<br><br>Visit the Eiffel Tower<br>Take a walk along the Champs-Élysées<br>Visit the Louvre Museum<br>See the Arc de Triomphe<br>Take a boat ride on the Seine River |  |  |  |
| travel-ai-002 | 1 | User Utterance | I want to go to Barcelona with my family of four in June |  |  |  |
| travel-ai-002 | 2 | Playbook Invocation | Travel Inspiration |  |  |  |
| travel-ai-002 | 3 | Agent Response | I'd be happy to help you find a hotel in Barcelona for your family of four in June. What are your preferred dates of travel? |  |  |  |
| travel-ai-002 | 4 | User Utterance | 1st through 10th |  |  |  |
| travel-ai-002 | 5 | Playbook Invocation | Book Hotel |  |  |  |
| travel-ai-002 | 6 | Tool Invocation | hotel_tool | {'city': 'Barcelona', 'num_results': 10} | hotel_search |  |
---

# Data Loading
The preferred method for data loading is to use Google Sheets.  
However, you can also manually build your dataset as a pandas DataFrame, or load it from CSV, BigQuery, etc. as needed.

## Option 1 - From Google Sheets

In [None]:
# Option 1: load the evaluation dataset from a Google Sheet.
from dfcx_scrapi.tools.evaluations import DataLoader

data = DataLoader()

# Sheet and tab to read from.
# `sheet_name` is read again by the "Write to Google Sheets" cell below.
sheet_name = "[TEMPLATE] Vertex Agent Evals Dataset Format"
sheet_tab = "golden-agent-evals"

# `sample_df` is the dataset handed to `evals.run_query_and_eval` later on.
sample_df = data.from_google_sheets(sheet_name, sheet_tab)

## Option 2 - From Local CSV

In [None]:
# Option 2: load the evaluation dataset from a local CSV file.
from dfcx_scrapi.tools.evaluations import DataLoader

data = DataLoader()

# Placeholder: set this to the path of your CSV file before running the cell.
csv_file_path = ""

# `sample_df` is the dataset handed to `evals.run_query_and_eval` later on.
sample_df = data.from_csv(csv_file_path)

## Option 3 - Manual Loading

In [None]:
import pandas as pd
from dfcx_scrapi.tools.evaluations import DataLoader

data = DataLoader()

# Column order that every row in `manual_rows` follows.
INPUT_SCHEMA_REQUIRED_COLUMNS = [
    'eval_id',
    'action_id',
    'action_type',
    'action_input',
    'action_input_parameters',
    'tool_action',
    'notes',
]

# One list per dataset row, in `INPUT_SCHEMA_REQUIRED_COLUMNS` order.
# Together they form a single three-turn conversation ("travel-ai-001").
manual_rows = [
    ["travel-ai-001", 1, "User Utterance", "Paris", "", "", ""],
    ["travel-ai-001", 2, "Playbook Invocation", "Travel Inspiration", "", "", ""],
    [
        "travel-ai-001",
        3,
        "Agent Response",
        "Paris is a beautiful city! Here are a few things you might enjoy doing there:\n\nVisit the Eiffel Tower\nTake a walk along the Champs-Élysées\nVisit the Louvre Museum\nSee the Arc de Triomphe\nTake a boat ride on the Seine River",
        "",
        "",
        "",
    ],
]

# Start from an empty frame and add the rows one at a time, labelled 0, 1, 2.
sample_df = pd.DataFrame(columns=INPUT_SCHEMA_REQUIRED_COLUMNS)
for row_label, row_values in enumerate(manual_rows):
    sample_df.loc[row_label] = row_values

# Pass the hand-built frame through the DataLoader before evaluation.
sample_df = data.from_dataframe(sample_df)

# Metrics
For multi-turn chat Agents w/ or w/out tool calling, there are currently 2 metrics supported:
1. `Response Similarity`, this performs a Semantic Similarity comparison between the expected "golden" response and the actual "predicted" response.
2. `Tool Call Quality`, this performs an EXACT_MATCH on 2 components of the Tool call:
    - Tool Name, i.e. was the correct Tool invoked
    - Tool Action, i.e. for the given Tool, was the correct Action / Endpoint invoked

Other metrics like `UrlMatch`, `Faithfulness`, `Answer Correctness`, `Context Recall` etc. will be supported in the future.

In [2]:
from dfcx_scrapi.tools.evaluations import Evaluations

# [1] Define your Agent ID here
# The value below is a placeholder; replace it with your own Agent's full
# resource name (projects/<project>/locations/<location>/agents/<agent>).
agent_id = "projects/your-project/locations/us-central1/agents/11111-2222-33333-44444" # Example Agent

# [2] Instantiate Evals class w/ Metrics
# The two metric names presumably map to the `Response Similarity` and
# `Tool Call Quality` metrics described above — confirm against the library.
evals = Evaluations(agent_id, metrics=["response_similarity", "tool_call_quality"])

# Predict and Evaluate
In this step, we will run all of the queries against the Agent that is being evaluated.  
Once the queries are returned, we will then compute all of the metrics.

In [3]:
# Run the first 10 rows of the dataset against the Agent and compute metrics.
# NOTE(review): `head(10)` cuts by row count, not by `eval_id`, so on a larger
# dataset it can truncate a conversation part-way through — confirm intended.
eval_results = evals.run_query_and_eval(sample_df.head(10))

# Report each metric as the mean of its column in the results.
# NOTE(review): only `tool_name_match` is averaged for Tool Call Quality,
# although that metric is described as covering both Tool Name and Tool
# Action — confirm whether a tool-action column should be reported as well.
print(f"Average Similarity {eval_results.similarity.mean()}")
print(f"Average Tool Call Quality {eval_results.tool_name_match.mean()}")

In [None]:
# Preview the first rows of the evaluation results; inspect further as needed.
eval_results.head()

# Write to Google Sheets
Storing the evaluation results to Google Sheets can be done with the following snippets.

In future revisions, we will add export to other formats, including `csv`, `bigquery`, etc.

In [None]:
# Write the evaluation results back to Google Sheets.
# NOTE: `sheet_name` is only assigned in the "Option 1 - From Google Sheets"
# cell. If the dataset was loaded via Option 2 or Option 3, define
# `sheet_name` before running this cell, otherwise it raises a NameError.
from dfcx_scrapi.tools.evaluations import DataLoader

data = DataLoader()

# The first call targets the "latest_results" tab (passed as `results_tab`);
# the second targets the "reporting" tab (passed as `summary_tab`).
data.write_eval_results_to_sheets(eval_results, sheet_name, results_tab="latest_results")
data.append_test_results_to_sheets(eval_results, sheet_name, summary_tab="reporting")

# Ending and Wrap-Up

In this notebook, we've shown how to programmatically Evaluate your Agent Builder or Dialogflow CX Agent.

For more information, see:
- [Vertex AI Agents](https://cloud.google.com/dialogflow/vertex/docs/concept/agents)
- [Dialogflow CX](https://cloud.google.com/dialogflow/cx/docs)