# Vertex AI Conversation - Evaluation Tool

This tool requires user input at several steps. Please run the cells one by one (Shift+Enter) to ensure that every step completes successfully.

# Setup


In [None]:
# @markdown `install packages`
!pip install dfcx-scrapi --quiet

# workaround until vertexai import is fixed
!pip uninstall bigframes -y --quiet
!pip install bigframes==0.26.0 --quiet

In [None]:
import dataclasses
import json
import sys

import pandas as pd
import vertexai

from dfcx_scrapi.tools.datastore_evaluator import DataStoreEvaluator, EvaluationResult, EvaluationVisualizer
from dfcx_scrapi.tools.datastore_scraper import DataStoreScraper, load_spreadsheet
from dfcx_scrapi.tools.metrics import RougeL, StatementBasedBundledMetric

In [None]:
# @markdown `authenticate`

# `credentials` is needed by the cells that follow in every environment, so it
# is resolved unconditionally at the end of this cell rather than only in Colab.
from google.auth import default

if "google.colab" in sys.modules:
    from google.colab import auth
    from google.colab import files

    # Interactive Colab login; afterwards the user's credentials are available
    # through Application Default Credentials.
    auth.authenticate_user()

# Outside Colab this discovers local credentials as described in
# https://cloud.google.com/docs/authentication/application-default-credentials
# and fails here, with a descriptive error, if none are configured.
credentials, _ = default()

# Evaluation

## Initialization

In [None]:
# @markdown `initialize vertex ai`

# @markdown > The project selected will be billed for calculating evaluation
# @markdown metrics that require large language models. It should have the
# @markdown [Vertex AI API](https://cloud.google.com/vertex-ai/docs/featurestore/setup)
# @markdown enabled. The LLM-based metrics use PaLM 2 for Text (Text Bison).
# @markdown For pricing information see this [page](https://cloud.google.com/vertex-ai/generative-ai/pricing).

# Form inputs: the project and location passed to the Vertex AI SDK below.
vertex_ai_project_id = ""  # @param{type: 'string'}
vertex_ai_location = ""  # @param{type: 'string'}

# Configure the Vertex AI SDK. `credentials` is assigned in the `authenticate`
# cell, so that cell has to be run before this one.
vertexai.init(
    project=vertex_ai_project_id,
    location=vertex_ai_location,
    credentials=credentials,
)

In [None]:
# @markdown `run this cell to initialize Dialogflow CX agent scraper`
# @markdown > This cell initializes the agent with the provided parameters.
# @markdown `project_id`, `location` and `agent_id` can be defined through one
# @markdown of the following options, while `language_code` must be defined
# @markdown in either case. The parameters for a given agent can be found in
# @markdown the DialogflowCX console url:
# @markdown `https://dialogflow.cloud.google.com/cx/projects/`**`{project_id}`**`/locations/`**`{location}`**`/agents/`**`{agent_id}`**`/intents`

# @markdown ---
# @markdown ### Option 1. - Provide agent_id directly
# @markdown Format: `projects/<Project ID>/locations/<Location ID>/agents/<Agent ID>`

agent_id = ""  # @param {type: "string"}

# @markdown ---
# @markdown ### Option 2. - Parse agent_id from url
# @markdown > **NOTE** : if `agent_url` is provided then it has precedence over
# @markdown directly provided agent_id.

agent_url = "" # @param {type: "string"}

# A non-empty `agent_url` wins; otherwise the scraper is built from `agent_id`.
scraper = (
    DataStoreScraper.from_url(agent_url=agent_url, creds=credentials)
    if agent_url
    else DataStoreScraper(agent_id=agent_id, creds=credentials)
)

In [None]:
# Smoke test: send one query to the agent and print the response, converted
# from a dataclass to a dict, as indented JSON.
response = scraper.scrape_detect_intent(query="who is the ceo?")
print(json.dumps(dataclasses.asdict(response), indent=4))

## Data Loading

The queryset must be in a tabular format that has to contain the following columns:
- `conversation_id` _(unique identifier of a conversation, which must be the same for all rows that are part of the same conversation)_
- `turn_index` _(index of the query - expected answer pair within a conversation)_
- `query` _(the input question)_
- `expected_answer` _(the ideal or ground truth answer)_
- `expected_uri` _(the webpage url or more generally the uri that contains the answer to `query`)_.
- `user_metadata` _(optional user metadata passed to datastore route with user query. Can be one of [str, dict])_
- `parameters` _(optional session parameters to include with the user query. Can be one of [str, dict])_

In addition to the required columns the RougeL metric can also use the following optional column:

- `golden_snippet` _(the extracted document snippet or segment that contains the `expected_answer`)_

An example for the queryset can be seen in this table:

| conversation_id | turn_index | query | expected_answer | expected_uri | user_metadata | parameters |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 1 | What is the capital of France? | Capital of France is Paris. | exampleurl.com/france | None | None |
| 0 | 2 | How many people live there? | 2.1 million people live in Paris. | exampleurl.com/paris | {"some_end_user_key": "some_value"} | {"param1": 1, "param2": "some_string"} |
| 1 | 1 | What is the color of the sky? | It is blue. | exampleurl.com/common | None | None |
| 2 | 1 | How many legs does an octopus have? | It has 8 limbs. | exampleurl.com/octopus | None | None |

---

Choose one of the following 3 options to load the queryset:



### Option 1. - Manual

In [None]:
# @markdown `run this cell to load data manually`

# Column order of the queryset. The constant name is kept unchanged; note that
# `user_metadata` and `parameters` may be left as None for any row.
INPUT_SCHEMA_REQUIRED_COLUMNS = [
    "conversation_id",
    "turn_index",
    "query",
    "expected_answer",
    "expected_uri",
    "user_metadata",
    "parameters",
]

# One entry per conversation turn, in INPUT_SCHEMA_REQUIRED_COLUMNS order.
sample_rows = [
    ["0", 1, "Who are you?", "I am an assistant", "www.google.com", None, None],
    [
        "1",
        1,
        "Who is the CEO?",
        "The CEO is Matt Reinjes.",
        "www.yeti.com",
        {"some_end_user_key": "some_value"},
        {"param1": 1, "param2": "some_string"},
    ],
    ["1", 2, "How much?", "The Basic plan costs 20$/month", "www.google.com", None, None],
]

# Build the frame once from the records instead of growing an empty frame row
# by row; this also lets pandas infer an integer dtype for `turn_index`.
sample_df = pd.DataFrame(sample_rows, columns=INPUT_SCHEMA_REQUIRED_COLUMNS)
queryset = sample_df

---
### Option 2. - From local .csv

In [None]:
# @markdown `run this cell to load the queryset from a .csv file in the filesystem`

csv_path = ""  # @param{type: 'string'}

# Read the file, then replace missing cells with empty strings.
queryset = pd.read_csv(csv_path)
queryset = queryset.fillna("")

if "user_metadata" not in queryset.columns:
  # No metadata column in the file: add one holding None for every row.
  queryset = queryset.assign(user_metadata=None)
else:
  # Empty metadata cells become None; every other value is kept as is.
  queryset = queryset.assign(
      user_metadata=queryset["user_metadata"].apply(
          lambda value: None if value == "" else value
      )
  )

---
### Option 3. - From Google Sheets

In [None]:
# @markdown `run this cell to load the queryset from Google Sheets`

sheet_url = "" # @param {type: "string"}
worksheet_name = "" # @param {type: "string"}
# @markdown > **NOTE**: if `worksheet_name` is not provided then `Sheet1` is used
# @markdown by default.

# Fall back to the default worksheet when the form field is left empty.
_worksheet_name = worksheet_name or "Sheet1"

queryset = load_spreadsheet(sheet_url, _worksheet_name, credentials)

if "user_metadata" not in queryset.columns:
  # No metadata column in the sheet: add one holding None for every row.
  queryset = queryset.assign(user_metadata=None)
else:
  # Empty metadata cells become None; every other value is kept as is.
  queryset = queryset.assign(
      user_metadata=queryset["user_metadata"].apply(
          lambda value: None if value == "" else value
      )
  )



## Scraping Responses

In [None]:
# Run the scraper over the loaded queryset. The result is the input of the
# `Computing Metrics` cell (`evaluator.run(scrape_result)`).
scrape_result = scraper.run(queryset)

## Metric Definition

Select the metrics that should be computed during evaluation.

> **NOTE** : Remember to rerun the cell below (Shift+Enter) after clicking the checkbox of the metric.

In [None]:
URL_MATCH = True # @param {type: "boolean"}
# @markdown url match metric computes the following boolean type columns:
# @markdown - `cited_url_match@1` - is `expected_uri` the same as the first link returned by Vertex AI Conversation
# @markdown - `cited_url_match` - is `expected_uri` part of the links returned by Vertex AI Conversation
# @markdown - `search_url_match` - is `expected_uri` part of the search results that are shown to the generative model in Vertex AI Conversation
# @markdown
# @markdown ---
ROUGEL = True # @param {type: "boolean"}
# @markdown rougeL metric computes a score between `[0, 1]` (higher is better) based on
# @markdown the longest common word subsequence between different targets and predictions:
# @markdown
# @markdown _(`cited_search_results` are the search snippets which were cited by the answer generator LLM)_
# @markdown - `rougeL_generative` - Compares `expected_answer` to `answer_text`.
# @markdown - `rougeL_extractive` - (only computed if
# @markdown `golden_snippet` is part of the dataset) Compares `golden_snippet` to `answer_snippets[0]`
# @markdown
# @markdown ---
ANSWER_CORRECTNESS = True # @param {type: "boolean"}
# @markdown LLM-based autoeval metric that compares `expected_answer` to `answer_text`. The metric
# @markdown makes approximately 5-10 LLM calls for each row of the dataset depending on `expected_answer` and `answer_text` length.
# @markdown It returns 3 output columns containing scores between `[0, 1]` (higher is better):
# @markdown - `answer_correctness_recall` - How well does `answer_text`'s information content cover `expected_answer`.
# @markdown - `answer_correctness_precision` - How much of `answer_text`'s information content is required based on `expected_answer`.
# @markdown - `answer_correctness_f1` - The harmonic mean of recall and precision.
# @markdown
# @markdown ---
FAITHFULNESS = True # @param {type: "boolean"}
# @markdown LLM-based autoeval metric that provides a score between `[0, 1]` (higher is better)
# @markdown with regard to how well `answer_text` is attributed to `search_snippets`. It makes approximately 5
# @markdown LLM calls for each row of the dataset depending on the length of `answer_text`.
# @markdown - `faithfulness_gmean`
# @markdown
# @markdown ---
CONTEXT_RECALL = True # @param {type: "boolean"}
# @markdown LLM-based autoeval metric that provides a score between `[0, 1]` (higher is better)
# @markdown for how well the `expected_answer` is attributed to `search_snippets`. In other words this metric
# @markdown scores the quality of search by measuring how well the `expected_answer` can be generated from the `search_snippets`.
# @markdown It makes approximately 5 LLM calls for each row of the dataset depending on the length of `expected_answer`.
# @markdown - `context_recall_gmean`
# @markdown
# @markdown ---

# Metric name -> form checkbox. Insertion order fixes the order of `metrics`.
metric_flags = {
    "url_match": URL_MATCH,
    "rougeL": ROUGEL,
    "answer_correctness": ANSWER_CORRECTNESS,
    "faithfulness": FAITHFULNESS,
    "context_recall": CONTEXT_RECALL,
}

# Keep only the metrics whose checkbox is ticked.
metrics = [name for name, enabled in metric_flags.items() if enabled]

evaluator = DataStoreEvaluator(metrics=metrics)

## Computing Metrics

In [None]:
# @markdown `evaluation results`
# Run the evaluator built in `Metric Definition` on the scraped responses.
evaluation_result = evaluator.run(scrape_result)

## Export results

### Option 1. - Display

In [None]:
# @markdown `run this cell to display evaluation results`
Number_of_rows = 3 # @param {type: "integer"}


# The trailing expression is rendered as the cell output and is limited to the
# first `Number_of_rows` rows.
# NOTE(review): the name `results` is reassigned to a list in the
# `define evaluation visualizer` cell; presumably display_on_screen() returns
# a DataFrame here — confirm.
results=evaluation_result.display_on_screen()
results.head(Number_of_rows)

### Option 2. - To local .csv and download to your system

In [None]:
# @markdown `run this cell to export evaluation results to a local .csv file`

FILE_NAME = "evaluation_results.csv" # @param {type: "string"}

filepath = evaluation_result.export_to_csv(FILE_NAME)

# Prompt user to download the file
print(f"CSV file created at: {filepath}")
print("Would you like to download the file? (y/n)")
# Strip surrounding whitespace so that answers such as "y " are accepted.
user_choice = input().strip().lower()

if user_choice != "y":
  print("Download skipped.")
elif "google.colab" in sys.modules:
  # Imported here so this cell does not rely on the name `files` having been
  # bound by the Colab branch of the authentication cell.
  from google.colab import files

  # Download the file using Colab's download feature
  files.download(filepath)
  print("File downloaded successfully!")
else:
  # Colab's download helper does not exist elsewhere; the file is already on
  # the local filesystem at `filepath`.
  print(f"Download is only available in Google Colab; the file is at: {filepath}")

### Option 3. - To Google Sheets

In [None]:
# @markdown `run this cell to export evaluation results into Google Sheets`

FOLDER_NAME = "result" # @param {type: "string"}
CHUNK_SIZE = 50 # @param {type: "number"}
WITH_TIMESTAMP = True # @param {type: "boolean"}

# Optionally suffix the folder name with the timestamp of the evaluation.
if WITH_TIMESTAMP:
  _folder_name = f"{FOLDER_NAME}_{evaluation_result.timestamp}"
else:
  _folder_name = FOLDER_NAME

# `folder_url` is read again by the `Result Visualization` cells.
folder_url = evaluation_result.export(_folder_name, CHUNK_SIZE, credentials)
print(f"Exported results to folder: {folder_url}")

### Option 4. - To BigQuery



In [None]:
# Form inputs identifying the destination of the export; they are passed to
# export_to_bigquery() in this order, followed by `credentials`.
BQ_PROJECT_ID="" # @param {type: "string"}
BQ_DATASET_ID="" # @param {type: "string"}
BQ_TABLE_NAME ="" # @param {type: "string"}


# NOTE(review): the return value is stored in `filepath`, the same name the
# .csv export cell uses for its output path — confirm what
# export_to_bigquery() actually returns.
filepath = evaluation_result.export_to_bigquery(BQ_PROJECT_ID,BQ_DATASET_ID,BQ_TABLE_NAME,credentials)

## Result Visualization

In [None]:
# @markdown `Folder url`
# Folders of exported evaluation results to load for visualization.
# `folder_url` is assigned by the Google Sheets export cell (Export results,
# Option 3), so that cell must have been run before this one.
FOLDER_URLS = [
    folder_url, # latest evaluation
    # add previous evaluations e.g: https://drive.google.com/drive/folders/<id>
]

In [None]:
# @markdown `define evaluation visualizer`

# Load one EvaluationResult per folder listed in FOLDER_URLS.
# The loop variable is deliberately NOT called `folder_url`: that global holds
# the url of the latest export and is read by the FOLDER_URLS cell, so
# overwriting it here would change what that cell picks up on a re-run.
results = []
for result_folder_url in FOLDER_URLS:
  er = EvaluationResult()
  er.load(result_folder_url, credentials)
  results.append(er)

evaluation_visualizer = EvaluationVisualizer(results)

In [None]:
# @markdown `radar plot of autoeval metrics`

# The metric classes are imported in the notebook's import cell, so the
# plotting cells can be run independently of each other.
evaluation_visualizer.radar_plot(StatementBasedBundledMetric.COLUMNS)

In [None]:
# @markdown `response type distribution`

# Plot the `response_type` column with the visualizer's count bar plot.
evaluation_visualizer.count_barplot("response_type")

In [None]:
# @markdown `average RougeL`

# Plot the columns listed in `RougeL.COLUMNS` with the visualizer's mean bar plot.
evaluation_visualizer.mean_barplot(column_names=RougeL.COLUMNS)