In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Auto Alignment Evaluation of LLM Output

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2F2025%2Fgenerative-ai%2Fgemini%2Fevaluation%2Fauto_alignment_evaluation_of_llm_output.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

In [1]:
# TODO: REMOVE THIS CELL FROM YOUR NOTEBOOK ###

import re
from urllib.parse import quote

from IPython.display import Markdown, display


def generate_html(file_path: str):
    match = re.search(
        r"(?:https://)?(?:github\.com/)?(?:GoogleCloudPlatform/)?(?:generative-ai/)?(?:blob/)?(?:main/)?([\w/-]+.ipynb)",
        file_path,
    )
    if not match:
        return "Could not generate table."

    file_path = match.group(1)

    base_url = "https://github.com/GoogleCloudPlatform/generative-ai/blob/main/"
    raw_github_url = (
        "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/"
    )

    colab_url = "https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/"
    colab_enterprise_url = f"https://console.cloud.google.com/vertex-ai/colab/import/{raw_github_url.replace('/', '%2F')}"
    vertex_ai_url = f"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url={raw_github_url}"
    bigquery_studio_url = (
        f"https://console.cloud.google.com/bigquery/import?url={base_url}"
    )

    linkedin_url = "https://www.linkedin.com/sharing/share-offsite/?url="
    bluesky_url = "https://bsky.app/intent/compose?text="
    twitter_url = "https://twitter.com/intent/tweet?url="
    reddit_url = "https://reddit.com/submit?url="
    facebook_url = "https://www.facebook.com/sharer/sharer.php?u="

    encoded_url = quote(f"{base_url}{file_path}")

    html = f"""
```html
<table align="left">
  <td style="text-align: center">
    <a href="{colab_url}{file_path}">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="{colab_enterprise_url}{file_path.replace('/', '%2F')}">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="{vertex_ai_url}{file_path}">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>"""

    # Add BigQuery Studio link only if the flag is True
    if INCLUDE_BIGQUERY_STUDIO:
        html += f"""
  <td style="text-align: center">
    <a href="{bigquery_studio_url}{file_path}">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/bigquery/v1/32px.svg" alt="BigQuery Studio logo"><br> Open in BigQuery Studio
    </a>
  </td>"""

    html += f"""
  <td style="text-align: center">
    <a href="{base_url}{file_path}">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="{linkedin_url}{encoded_url}" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="{bluesky_url}{encoded_url}" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="{twitter_url}{encoded_url}" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="{reddit_url}{encoded_url}" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="{facebook_url}{encoded_url}" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>
```"""
    return html


# File path from the repository root
file_path = "imported/PANW PSO/NEXT 2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb"  # @param {type:"string"}

# Include link to Open in BigQuery Studio
INCLUDE_BIGQUERY_STUDIO = False  # @param {type:"boolean"}
display(Markdown(generate_html(file_path)))


```html
<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2F2025%2Fgenerative-ai%2Fgemini%2Fevaluation%2Fauto_alignment_evaluation_of_llm_output.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>
```

| Author(s) |
| --- |
| [Jennifer Liang](https://github.com/jenniferliangc) |

## Overview

This tutorial brings a new way to evaluate LLM performance against ground truth. We build a customizable, line-by-line automated evaluator for use cases where high precision is required. This method eliminates the need for repetitive prompt tuning, minimizes hallucinations, and ensures repeatable and accurate results.


In this tutorial, we use a recipe dataset that has ground truth and LLM outputs. We will perform two phases of evaluation:

1. Rephraser Evaluator (0-1 point): use a semantic similarity model to check similarity on LLM output vs ground truth.

2. Final Answer Evaluator (0-5 points): we have three criterias to evaluate in this phase:
> - a) Source Score (0-1 point): did the LLM choose the same source (source 1, 2, 3, etc.) as in the ground truth?
> - b) Ingredient Sentence Score (0-2 points): sentence-level comparison to check if LLM outputted the same ingredient list as in the ground truth. 
> - c) Instruction Sentence Score (0-2 points): sentence-level comparison to check if LLM outputted the same instructions as in the ground truth.
> - For a, b, and c, penalties are added for any missed ground truth sources or sentences. A penalty is also added for any extra sources or sentences that the LLM produces not present in the ground truth

## Get started

### Install Google Gen AI SDK and other required packages


In [1]:
%pip install --upgrade --quiet google-cloud-aiplatform


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

from google import genai

client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Import libraries

In [2]:
from IPython.display import Markdown, display

import json
import logging
import time
import uuid
import pandas as pd
import numpy as np
import os, sys, vertexai
import regex as re
import json
import ast
import requests
from typing import NamedTuple
from google.cloud import aiplatform

# Logging
logger = logging.getLogger("logger")
logging.basicConfig(level=logging.WARNING)

## Tutorial Start

### Load Dataset

In [4]:
df = pd.read_csv("asian_chef_advisor_dataset.csv")

In [5]:
df.head(2)

Unnamed: 0,case_id,query,ground_truth_rephrased_query,gemini_rephrased_query_llm,ground_truth_final_answer,gemini_final_answer_llm,gemini_alternative_answer,gemini_alternative_rephrased_query_llm
0,1,I have a lot of tofu and I want to make a Chin...,Sweet sour tofu bell pepper onion Chinese stir...,Sweet and sour tofu stir-fry with bell peppers...,Recipe 1: Mapo Tofu\n\nIngredients:\n\n½ cup o...,Recipe 1: (From Document 1). Best Dish: Mapo T...,"Okay, I understand. I will analyze the search ...",Tofu stir-fry recipe with sweet and sour sauce...
1,2,I have some dried shiitake mushrooms and I'd l...,Shiitake mushroom pork belly star anise braise...,"Shiitake mushroom, pork belly, star anise, soy...",Best Dish: Braised Pork and Egg with Rice\n\nR...,Recipe 1: Braised Pork and Egg with Rice\n\nIn...,Recipe 1: SHANGHAI BRAISED PORK BELLY\n\nIngre...,Chinese braised pork belly with shiitake mushr...


### Rephraser Evaluator

In [7]:
# Function to calculate cosine similarity between two sentences
def calculate_similarity(llm_rephraser, gt_rephraser):
    embeddings1 = model.encode(llm_rephraser, convert_to_tensor=True)
    embeddings2 = model.encode(gt_rephraser, convert_to_tensor=True)
    cosine_sim = util.cos_sim(embeddings1, embeddings2)
    return cosine_sim.item()

In [8]:
df['rephraser_semantic_similarity'] = df.apply(lambda row: calculate_similarity(row['gemini_final_answer_llm'], row['ground_truth_rephrased_query']), axis=1)

In [None]:
df.head(2)

### Final Answer Evaluator

#### Source Scoring

In [9]:
# Extract Recipe Number Source

def find_recipe_number(text):
    """
    Finds and extracts unique recipe numbers from a given text string.

    The function searches for the pattern "Recipe " followed by one or more digits.
    It returns a sorted list of the unique recipe numbers found in the text.
    If the input text is NaN or if no recipe numbers are found, an empty list is returned.

    Args:
        text (str): The input string to search for recipe numbers.

    Returns:
        list: A sorted list of unique recipe numbers (as strings) found in the text.
              Returns an empty list if no recipe numbers are found or if the input
              text is pandas.NA.
    """ 
    if pd.isna(text):
        return []
    else:
        recipe_numbers = []
        pattern = r"Recipe (\d+)"
        matches = re.findall(pattern, text)
        if matches:
            recipe_numbers.extend(matches)
            return sorted(list(set(recipe_numbers)))
        else:
            return []

In [10]:
df['gt_sources_extracted'] = df['ground_truth_final_answer'].apply(find_recipe_number)
df['llm_response_sources'] = df['gemini_final_answer_llm'].apply(find_recipe_number)

In [11]:
df.head(2)

Unnamed: 0,case_id,query,ground_truth_rephrased_query,gemini_rephrased_query_llm,ground_truth_final_answer,gemini_final_answer_llm,gemini_alternative_answer,gemini_alternative_rephrased_query_llm,rephraser_semantic_similarity,gt_sources_extracted,llm_response_sources
0,1,I have a lot of tofu and I want to make a Chin...,Sweet sour tofu bell pepper onion Chinese stir...,Sweet and sour tofu stir-fry with bell peppers...,Recipe 1: Mapo Tofu\n\nIngredients:\n\n½ cup o...,Recipe 1: (From Document 1). Best Dish: Mapo T...,"Okay, I understand. I will analyze the search ...",Tofu stir-fry recipe with sweet and sour sauce...,0.686584,[1],[1]
1,2,I have some dried shiitake mushrooms and I'd l...,Shiitake mushroom pork belly star anise braise...,"Shiitake mushroom, pork belly, star anise, soy...",Best Dish: Braised Pork and Egg with Rice\n\nR...,Recipe 1: Braised Pork and Egg with Rice\n\nIn...,Recipe 1: SHANGHAI BRAISED PORK BELLY\n\nIngre...,Chinese braised pork belly with shiitake mushr...,0.731325,[1],[1]


In [12]:
# SOURCE SCORE CALCULATION

def source_score(llm_sources, gt_sources):
    """
    Calculates a source score by comparing the sources provided by an LLM
    to the ground truth (GT) sources.

    The score is based on the number of correctly identified sources, with penalties
    for incorrect LLM-provided sources and missed ground truth sources.

    Args:
        llm_sources (list): A list of sources provided by the LLM.
        gt_sources (list): A list of ground truth sources.

    Returns:
        tuple: A tuple containing the following five float values:
            - source_score: The calculated source score (capped at 0).
            - points_per_source: The base points awarded for each correct ground truth source.
            - correct_sources_score: The total score from correctly identified sources
              before penalties.
            - incorrect_source_penalty: The penalty applied for incorrect sources
              provided by the LLM.
            - missed_source_penalty: The penalty applied for ground truth sources
              that were not identified by the LLM.
    """

    if not llm_sources or not gt_sources:
        return 0, 0, 0, 0, 0

    # SOURCE SCORE CALCULATION

    # Calculate points per source
    if len(gt_sources) == 0:
        points_per_source = 0.00
    else:
        points_per_source = round(1 / len(gt_sources), 2)

    # Count correct and incorrect matches using set operation
    correct_sources = len(set(gt_sources) & set(llm_sources))
    incorrect_llm_sources = len(set(llm_sources) - set(gt_sources)) #finds the values that are in llm_sources but NOT in gt_sources
    missed_gt_sources = len(set(gt_sources) - set(llm_sources)) #finds the values that are in gt_sources but NOT in llm_sources

    # Define penalty values (adjust penalty % as needed)
    incorrect_source_penalty = 0.10 * points_per_source * incorrect_llm_sources # Per incorrect LLM source
    missed_source_penalty = 0.20 * points_per_source * missed_gt_sources # Per missed GT source
    correct_sources_score = correct_sources * points_per_source

    # Calculate the score and ensure it doesn't go below zero
    source_score = round(max(0, (correct_sources_score -
                                 incorrect_source_penalty -
                                 missed_source_penalty)), 2)

    return source_score, points_per_source, correct_sources_score, incorrect_source_penalty, missed_source_penalty

In [13]:
df[['source_score', 'points_per_source', 'correct_sources_score', 'incorrect_source_penalty', 'missed_source_penalty']] = df.apply(
lambda row: source_score(row['llm_response_sources'], row['gt_sources_extracted']), axis=1, result_type="expand")

#### Ingredients and Instruction Scoring

In [14]:
def extract_instructions_regex(text: str) -> str:
    # Regex to find text after "Instructions:"
    match = re.search(r"instructions?\s*(.*)", text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip().replace("{", "").replace("}", "").replace("•", "").replace(":", "")
    return ""

def extract_ingredients_regex(text):
    # Regex to find text between "Ingredients" and "Instructions"
    match = re.search(r"ingredients?\s*(.*?)(?:\s*[\"'\n]*\s*instructions:?)", text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip().replace("{", "").replace("}", "").replace("•", "").replace(":", "")
    return ""

def extract_ingredients_and_instructions(llm_text, gt_text, llm_sources, gt_sources):
    """
    Extracts ingredients and instructions from LLM-generated text and ground truth text,
    focusing on the content associated with correctly identified sources.

    The function first identifies the common sources between the LLM's output and the
    ground truth. Then, it extracts the full ingredients and instructions blocks from
    both the LLM text and the ground truth text using regular expressions
    (via `extract_ingredients_regex` and `extract_instructions_regex`).
    The extracted ingredient and instruction blocks are then duplicated for each
    correctly identified source, creating parallel lists.

    Args:
        llm_text (str): The text generated by the Language Model.
        gt_text (str): The ground truth text.
        llm_sources (List[str]): A list of sources cited by the LLM.
        gt_sources (List[str]): A list of ground truth sources.

    Returns:
        Tuple[List[str], List[str], List[str], List[str]]: A tuple containing four lists:
            - llm_ingredients_text (List[str])
            - llm_instructions_text (List[str])
            - gt_ingredients_text (List[str])
            - gt_instructions_text (List[str])
            Returns four empty lists if any of the input texts or source lists are empty.
    """

    if not llm_text or not llm_sources or not gt_sources:
        return [], [], [], []

    correct_sources = sorted(list(set(gt_sources) & set(llm_sources)))

    llm_ingredients_text = []
    gt_ingredients_text = []
    
    llm_instructions_text = []
    gt_instructions_text = []

    # --- Process LLM Text ---
    llm_ingredients_block = extract_ingredients_regex(llm_text)
    if not llm_ingredients_block:
        llm_ingredients_block = ""
    llm_instructions_block = extract_instructions_regex(llm_text)
    if not llm_instructions_block:
        llm_instructions_text = ""

    # --- Process GT Text ---
    gt_ingredients_block = extract_ingredients_regex(gt_text)
    if not gt_ingredients_block:
        gt_ingredients_block = ""
    gt_instructions_block = extract_instructions_regex(gt_text)
    if not gt_instructions_block:
        gt_instructions_block = ""

    for source in correct_sources:
        llm_ingredients_text.append(llm_ingredients_block)
        llm_instructions_text.append(llm_instructions_block)
        
        gt_ingredients_text.append(gt_ingredients_block)
        gt_instructions_text.append(gt_instructions_block)

    return llm_ingredients_text, llm_instructions_text, gt_ingredients_text, gt_instructions_text

In [15]:
df[['llm_ingredients_text', 'llm_instructions_text', 'gt_ingredients_text', 'gt_instructions_text']] = df.apply(
        lambda row: extract_ingredients_and_instructions(row['gemini_final_answer_llm'], row['ground_truth_final_answer'], row['llm_response_sources'],
                                                         row['gt_sources_extracted']), axis=1, result_type="expand")

In [16]:
# Converting each list into a single string joined by a newline character and store it in a new column

df['gt_ingredients_text_split'] = df['gt_ingredients_text'].apply(
lambda item: "\n".join(item) if isinstance(item, list) else "")

df['gt_instructions_text_split'] = df['gt_instructions_text'].apply(
lambda item: "\n".join(item) if isinstance(item, list) else "")

df['llm_ingredients_text_split'] = df['llm_ingredients_text'].apply(
    lambda item: "\n".join(item) if isinstance(item, list) else "")

df['llm_instructions_text_split'] = df['llm_instructions_text'].apply(
    lambda item: "\n".join(item) if isinstance(item, list) else "")

In [17]:
# Function to produce Ingredients & Instructions Score

def evaluate_content(llm_text, gt_text, casenum, threshold_in):
    """
    Evaluates the semantic similarity between sentences in LLM-generated text and
    ground truth text to produce a content score.

    The function splits both texts into sentences, encodes them using a SentenceTransformer model,
    and calculates cosine similarity between all pairs of LLM and ground truth sentences.
    It then matches sentences based on a provided similarity threshold and calculates a score
    that rewards matched sentences and penalizes extra LLM-generated sentences and
    missing ground truth sentences. Transition words (defined in `ignore_words`) are
    excluded from penalty calculations.

    Args:
        llm_text (str): The text generated by the Language Model.
        gt_text (str): The ground truth text.
        casenum (str): A unique identifier for the case being evaluated. This is
            included in the output DataFrame.
        threshold_in (float): The cosine similarity threshold (between 0 and 1)
            above which an LLM sentence is considered a match for a ground truth sentence.

    Returns:
        pandas.DataFrame: A DataFrame with a single row containing the evaluation metrics:
            - 'case_id': The provided `casenum`.
            - 'sentences_score': The overall content score (between 0 and 2).
            - 'num_gt_sentences': The total number of sentences in the ground truth text.
            - 'points_per_sentence': The base points awarded per matched ground truth sentence.
            - 'num_llm_sentences': The total number of sentences in the LLM-generated text.
            - 'correct_sentences_score': The total score from correctly matched sentences.
            - 'extra_sentences_penalty': The penalty applied for extra sentences in the LLM text.
            - 'gt_not_in_llm_penalty': The penalty applied for ground truth sentences not found in the LLM text.
            - 'num_equal_sentences': The number of exactly matching sentences (case-sensitive).
            - 'matched_sentences_count': The number of ground truth sentences with a similarity above the threshold in the LLM text.
            - 'gt_not_in_llm_count': The number of ground truth sentences with no similar counterpart in the LLM text (above the threshold, excluding transition sentences).
            - 'extra_sentences_count': The number of LLM sentences with no similar counterpart in the ground truth text (above the threshold, excluding transition sentences).
            - 'gt_transition_sentence_count': The number of ground truth sentences identified as transition sentences (based on `ignore_words`).
            - 'llm_transition_sentence_count': The number of LLM sentences identified as transition sentences.
            - 'gt_sentences': The ground truth text with sentences separated by newlines.
            - 'llm_sentences': The LLM-generated text with sentences separated by newlines.
            - 'matched_sentences': Pairs of matched ground truth and LLM sentences, each pair separated by a newline.
            - 'extra_sentences': LLM sentences that did not meet the similarity threshold with any ground truth sentence.
            - 'gt_not_in_llm_sentences': Ground truth sentences that did not meet the similarity threshold with any LLM sentence.

    Raises:
        Exception: If any error occurs during the evaluation process, the error is printed,
            and a DataFrame with default zero/empty values is returned.
    """
    
    model = SentenceTransformer('all-mpnet-base-v2')
    threshold = threshold_in

    try:
        if len(llm_text) > 0 and len(gt_text) > 0 and llm_text != "None" and gt_text != "None":
            # Split ground truth and LLM text into sentences
            gt_sentences = gt_text.split("\n")
            gt_sentences = [sent.strip() for sent in gt_sentences if sent.strip()]

            llm_sentences = llm_text.split("\n")
            llm_sentences = [sent.strip() for sent in llm_sentences if sent.strip()]

            # Encode LLM and ground truth sentences
            llm_embeddings = model.encode(llm_sentences)
            gt_embeddings = model.encode(gt_sentences)

            # Calculate similarity for each LLM sentence to all ground truth sentences
            similarities = util.cos_sim(llm_embeddings, gt_embeddings)

            # Initialize lists to store matched and extra sentences
            matched_sentences = []
            extra_sentences = []
            gt_not_in_llm_sentences = []

            # Count sentences with high similarity and penalize for extra sentences
            matched_sentences_count = 0
            extra_sentences_count = 0
            gt_not_in_llm_count = 0
            gt_transition_sentence_count = 0
            llm_transition_sentence_count = 0

            # To ignore transition sentences on penalty calculation
            ignore_words = [",", "\""]
            ignore_pattern = re.compile(r"|".join(ignore_words), re.IGNORECASE)  # Case-insensitive pattern

            for gt_index, gt_sentence in enumerate(gt_sentences):
                max_sim_index = np.argmax(similarities[:, gt_index])
                if similarities[max_sim_index, gt_index] > threshold:
                    matched_sentences_count += 1
                    matched_pair = (
                        f"{gt_sentence}\n",
                        f"{llm_sentences[max_sim_index]}\n"
                    )
                    matched_sentences.append(matched_pair)
                else:
                    if ignore_pattern.search(gt_sentence):
                        gt_transition_sentence_count += 1
                    else:
                        gt_not_in_llm_count += 1
                        gt_not_in_llm_sentences.append(gt_sentence + "\n")

            for i, llm_similarities in enumerate(similarities):
                if max(llm_similarities) < threshold:
                    if ignore_pattern.search(llm_sentences[i]):
                        llm_transition_sentence_count += 1
                    else:
                        extra_sentences_count += 1
                        extra_sentences.append(llm_sentences[i] + "\n")


            if (len(gt_sentences) - gt_transition_sentence_count) == 0:
                points_per_sentence = 0.00
            else:
                points_per_sentence = round(2 / (len(gt_sentences) - gt_transition_sentence_count), 4)

            # Calculate the score
            correct_sentences_score = matched_sentences_count * points_per_sentence
            extra_sentences_penalty = extra_sentences_count * 0.10 * points_per_sentence # Apply penalty for extra sentences
            gt_not_in_llm_penalty = gt_not_in_llm_count * 0.20 * points_per_sentence # Apply penalty for sentences in GT not in LLM

            # Store number of sentences
            num_gt_sentences = len(gt_sentences)
            num_llm_sentences = len(llm_sentences)

            # Count the number of exactly matching sentences
            num_equal_sentences = sum(1 for gt, llm in matched_sentences if gt.strip() == llm.strip())

            score = round(max(0, min(2, correct_sentences_score - extra_sentences_penalty - gt_not_in_llm_penalty)), 3)

        else:
            points_per_sentence = 0
            score = 0
            gt_sentences = []
            llm_sentences = []
            correct_sentences_score = 0
            extra_sentences_penalty = 0
            num_gt_sentences = 0
            num_llm_sentences = 0
            matched_sentences = []
            extra_sentences = []
            num_equal_sentences = 0
            gt_not_in_llm_sentences = []
            gt_not_in_llm_count = 0
            gt_not_in_llm_penalty = 0
            matched_sentences_count = 0
            extra_sentences_count = 0
            gt_transition_sentence_count = 0
            llm_transition_sentence_count = 0

    except Exception as e:  # Broad exception handling to catch any errors
        print(f"Error in case {casenum}: {e}")  # Log the error for debugging

        # Return empty strings/zeros for all columns
        data = {
            'Case Number': casenum,
            'sentences_score': 0,
            'num_gt_sentences': 0,
            'points_per_sentence': 0,
            'num_llm_sentences': 0,
            'correct_sentences_score': 0,
            'extra_sentences_penalty': 0,
            'gt_not_in_llm_penalty': 0,
            'num_equal_sentences': 0,
            'matched_sentences_count': 0,
            'gt_not_in_llm_count': 0,
            'extra_sentences_count': 0,
            'gt_transition_sentence_count': 0,
            'llm_transition_sentence_count': 0,
            'gt_sentences': "",
            'llm_sentences': "",
            'matched_sentences': "",
            'extra_sentences': "",
            'gt_not_in_llm_sentences': ""
        }
        return pd.DataFrame(data, index=[0])

    data = {
        # MAKE SURE YOU'RE GIVING 'casenum' THE SAME NAME AS in YOUR 'df' AS YOU WILL BE MERGING LATER
        'case_id': casenum,
        'sentences_score': max(0, min(2, score)),
        'num_gt_sentences': num_gt_sentences,
        'points_per_sentence': points_per_sentence,
        'num_llm_sentences': num_llm_sentences,
        'correct_sentences_score': correct_sentences_score,
        'extra_sentences_penalty': extra_sentences_penalty,
        'gt_not_in_llm_penalty': gt_not_in_llm_penalty,
        'num_equal_sentences': num_equal_sentences,
        'matched_sentences_count': matched_sentences_count,
        'gt_not_in_llm_count': gt_not_in_llm_count,
        'extra_sentences_count': extra_sentences_count,
        'gt_transition_sentence_count' : gt_transition_sentence_count,
        'llm_transition_sentence_count' : llm_transition_sentence_count,

        # Join list elements into a single string
        'gt_sentences': "\n".join(gt_sentences),
        'llm_sentences': "\n".join(llm_sentences),
        'matched_sentences': "\n".join([f"{gt}\n{llm}" for gt, llm in matched_sentences]),
        'extra_sentences': "\n".join(extra_sentences),
        'gt_not_in_llm_sentences': "\n".join(gt_not_in_llm_sentences)
    }

    return pd.DataFrame(data, index=[0])

In [18]:
# Semantic Similarity Threshold default value is 85%

results_ingredients = df.apply(
lambda row: evaluate_content(row['llm_ingredients_text_split'], row['gt_ingredients_text_split'], row['case_id'], 0.85), axis=1)

In [19]:
# Semantic Similarity Threshold default value is 75%

results_instructions = df.apply(
lambda row: evaluate_content(row['llm_instructions_text_split'], row['gt_instructions_text_split'], row['case_id'], 0.75), axis=1)

In [20]:
df_ingredients = pd.concat(results_ingredients.tolist(), ignore_index=True)
df_instructions = pd.concat(results_instructions.tolist(), ignore_index=True)

In [21]:
df = df.merge(df_ingredients, on='case_id', how='left')

In [22]:
# Identify potential overlapping columns (excluding the join key)
overlapping_cols_ingredients = list(set(df.columns) & set(df_ingredients.columns) - {'case_id'})
overlapping_cols_instructions = list(set(df.columns) & set(df_instructions.columns) - {'case_id'})

# Rename overlapping columns in df_ingredients
df_ingredients = df_ingredients.rename(columns={col: f"{col}_ingredient" for col in overlapping_cols_ingredients})

# Rename overlapping columns in df_instructions
df_instructions = df_instructions.rename(columns={col: f"{col}_instruction" for col in overlapping_cols_instructions})

# Perform the merges
final_df = df.merge(df_ingredients, on='case_id', how='left')
final_df = final_df.merge(df_instructions, on='case_id', how='left')

In [23]:
# Dropping duplicate columns

columns_to_drop = ['sentences_score', 'num_gt_sentences', 'points_per_sentence', 'num_llm_sentences', 'correct_sentences_score', 'extra_sentences_penalty', 'gt_not_in_llm_penalty', 'num_equal_sentences', 'matched_sentences_count', 'gt_not_in_llm_count', 'extra_sentences_count', 'gt_transition_sentence_count', 'llm_transition_sentence_count', 'gt_sentences', 'llm_sentences', 'matched_sentences', 'extra_sentences', 'gt_not_in_llm_sentences']

final_df.drop(columns=columns_to_drop, axis=1, inplace=True)

In [24]:
final_df['final_score'] = final_df['source_score'] + final_df['sentences_score_ingredient'] + final_df['sentences_score_instruction']

#### Edge case for "No Answer Available"

In [28]:
def no_answer_case(df, gt_col, llm_col, similarity_threshold=0.2):
    """
    Identifies and scores cases where both the ground truth and the LLM indicate
    an inability to answer a question.

    This function iterates through a DataFrame, comparing the answers in a ground
    truth column and an LLM output column. It checks if both answers express
    a lack of information using either semantic similarity to the phrase
    "No answer to this question" (above a specified threshold) or by containing
    predefined "no answer" keywords. If both conditions are met for a row,
    specific score columns ('source_score', 'sentences_score_ingredient',
    'sentences_score_instruction', 'final_score') in that row are set to predefined
    positive values, indicating a correctly identified "no answer" scenario.

    Args:
        df (pd.DataFrame): The DataFrame containing the ground truth and LLM answers.
        gt_col (str): The name of the column containing the ground truth answers.
        llm_col (str): The name of the column containing the LLM-generated answers.
        similarity_threshold (float, optional): The minimum cosine similarity
            (between 0 and 1) to the phrase "No answer to this question" for an
            answer to be considered a "no answer" case based on semantic similarity.

    Returns:
        pd.DataFrame: The input DataFrame with updated score values for rows where
                      both ground truth and LLM indicate an inability to answer.
                      Specific score columns are set to 1, 2, 2, and 5 respectively
                      for these "no answer" cases.

    Raises:
        KeyError: If the specified `gt_col` or `llm_col` are not found in the DataFrame.
        Exception: For any other unexpected error during row processing, an error
                   message is printed, and the row is skipped.
    """
    
    
    target_phrase = "No answer to this question"
    no_answer_keywords = ["no answer", "cannot answer", "not able to answer",
                           "not able to", "not found", "not available",
                           "no relevant information", "does not contain",
                           "unable to find"]

    for index, row in df.iterrows():
        try:
            gt_answer = str(row[gt_col]).strip().lower()
            llm_answer = str(row[llm_col]).strip().lower()

            gt_similarity = calculate_similarity(gt_answer, target_phrase)
            llm_similarity = calculate_similarity(llm_answer, target_phrase)

            gt_has_keyword = any(keyword in gt_answer for keyword in no_answer_keywords)
            llm_has_keyword = any(keyword in llm_answer for keyword in no_answer_keywords)

            # print(f"Index: {index}, GT Similarity: {gt_similarity:.4f}, LLM Similarity: {llm_similarity:.4f}, GT Keywords: {gt_has_keyword}, LLM Keywords: {llm_has_keyword}")

            if (gt_similarity >= similarity_threshold and llm_similarity >= similarity_threshold) or (gt_has_keyword and llm_has_keyword):
                df.loc[index, 'source_score'] = 1
                df.loc[index, 'sentences_score_ingredient'] = 2
                df.loc[index, 'sentences_score_instruction'] = 2
                df.loc[index, 'final_score'] = 5
        except KeyError as e:
            print(f"Error: Column '{e}' not found in the DataFrame for row {index}. Skipping row.")
        except Exception as e:
            print(f"An unexpected error occurred while processing row {index}: {e}")
    return df

### Final Result

In [31]:
no_answer_case(
        df=final_df,
        gt_col="ground_truth_final_answer",
        llm_col="gemini_final_answer_llm",
        similarity_threshold=0.5  # You can adjust the threshold if needed
    )

Unnamed: 0,case_id,query,ground_truth_rephrased_query,gemini_rephrased_query_llm,ground_truth_final_answer,gemini_final_answer_llm,gemini_alternative_answer,gemini_alternative_rephrased_query_llm,rephraser_semantic_similarity,gt_sources_extracted,...,gt_not_in_llm_count_instruction,extra_sentences_count_instruction,gt_transition_sentence_count_instruction,llm_transition_sentence_count_instruction,gt_sentences_instruction,llm_sentences_instruction,matched_sentences_instruction,extra_sentences_instruction,gt_not_in_llm_sentences_instruction,final_score
0,1,I have a lot of tofu and I want to make a Chin...,Sweet sour tofu bell pepper onion Chinese stir...,Sweet and sour tofu stir-fry with bell peppers...,Recipe 1: Mapo Tofu\n\nIngredients:\n\n½ cup o...,Recipe 1: (From Document 1). Best Dish: Mapo T...,"Okay, I understand. I will analyze the search ...",Tofu stir-fry recipe with sweet and sour sauce...,0.686584,[1],...,0,0,0,0,"On a medium heat, toast the chilies. Add ¼ cup...","1. On a medium heat, toast the chilies. Add ¼ ...","On a medium heat, toast the chilies. Add ¼ cup...",,,5.0
1,2,I have some dried shiitake mushrooms and I'd l...,Shiitake mushroom pork belly star anise braise...,"Shiitake mushroom, pork belly, star anise, soy...",Best Dish: Braised Pork and Egg with Rice\n\nR...,Recipe 1: Braised Pork and Egg with Rice\n\nIn...,Recipe 1: SHANGHAI BRAISED PORK BELLY\n\nIngre...,Chinese braised pork belly with shiitake mushr...,0.731325,[1],...,0,0,0,0,Rinse mushrooms and soak in hot water until so...,Rinse mushrooms and soak in hot water until so...,Rinse mushrooms and soak in hot water until so...,,,4.999
2,3,I'm looking for a Cantonese recipe that uses c...,Chicken feet black bean sauce steamed Cantones...,Cantonese steamed chicken feet with black bean...,No answer to this question,I am not able to answer this question. None of...,Based on your query for a Cantonese chicken fe...,"Chicken feet recipe, Cantonese, black bean sau...",0.921332,[],...,0,0,0,0,,,,,,5.0
3,4,I'm looking for a simple Chinese stir-fry reci...,Beef and broccoli. quick to make,Quick beef and broccoli stir-fry with light sa...,Recipe 1: Beef and Broccoli\n\nIngredients:\n\...,Recipe 1: Beef and Broccoli\n\nIngredients:\n\...,Recipe 1: Beef and Broccoli\n\nIngredients: 1¼...,"Beef and broccoli stir-fry recipe with light, ...",0.645821,[1],...,0,0,0,0,Place the water and cornstarch/cornflour into ...,Place the water and cornstarch/cornflour into ...,Place the water and cornstarch/cornflour into ...,,,5.0
4,5,I'm looking for an Indonesian dish with beef. ...,Spicy Indonesian dish with beef and ma la flavor.,Spicy Indonesian beef with ma la and dried chi...,Recipe 1: Beef Rendang\n\nIngredients:\n\nSpic...,Recipe 1: Beef Rendang\n\nIngredients:\n\nSpic...,"The customer is looking for a spicy, numbing I...","Spicy, numbing Indonesian beef dish with dried...",0.667477,[1],...,0,0,0,0,Puree Spice paste in a food processor until fi...,Puree Spice paste in a food processor until fi...,Puree Spice paste in a food processor until fi...,,,5.0
5,6,"I have some preserved duck eggs, and I'd like ...",Preserved duck egg ginger soy sauce cold Chine...,"Preserved duck egg appetizer, ginger, soy sauce.",Recipe 1: Silken Tofu with Century Egg\n\nIngr...,Recipe 1: Silken Tofu with Century Egg\n\nIngr...,Recipe 1: Silken Tofu with Century Egg\n\nIngr...,Preserved duck egg cold appetizer with ginger ...,0.545543,[1],...,0,0,0,0,Drain the tofu and arrange it on a plate with ...,Drain the tofu and arrange it on a plate with ...,Drain the tofu and arrange it on a plate with ...,,,5.0
6,7,I'm interested in a dish that uses stuffed egg...,Stuffed eggplant with ground pork and dried sh...,Savory stuffed eggplant with ground pork and d...,Recipe 1: Stir Fried Eggplant with Minced Pork...,Document 1 contains a recipe that closely matc...,Recipe 1: Stir fried eggplant with minced pork...,"Stuffed eggplant recipes with ground pork, sav...",0.690133,[1],...,0,0,0,0,"Mix sesame oil, corn flour, egg whites, shaoxi...","Mix sesame oil, corn flour, egg whites, shaoxi...","Mix sesame oil, corn flour, egg whites, shaoxi...",,,4.999
7,8,I'm looking for a vegetarian dish that's inspi...,Spicy Indian vegetarian soup with rice recipe,Spicy Indian vegetable soup with rice.,Recipe 1: Rasam\n\nIngredients:\n\nTamarind (s...,Recipe 1: Rasam\n\nIngredients:\n\nTamarind (s...,"The customer is looking for a spicy, Indian-in...",Spicy Indian-inspired vegetarian soup with veg...,0.731718,[1],...,0,0,0,1,Soak the tamarind in hot water for 30 mins the...,Soak the tamarind in hot water for 30 mins the...,Soak the tamarind in hot water for 30 mins the...,,,4.899
8,9,I have some fresh salmon and I want to make a ...,Japanese grilled salmon rice dish with cucumber,"Salmon and rice dish with cucumber, Japanese s...",Recipe 1: JAPANESE SALMON FLAKE ONIGIRI (RICE ...,Recipe 1: JAPANESE SALMON FLAKE ONIGIRI (RICE ...,Neither of the provided documents offers a rec...,"Salmon rice bowl with cucumber, Japanese style.",0.720132,[1],...,0,5,0,0,)\nInstructions\nSeason salmon with salt. Gril...,)\nInstructions\nSeason salmon with salt. Gril...,)\n\n)\n\nInstructions\n\nInstructions\n\nSeas...,Recipe 2 JAPANESE TEMAKI (CONE STYLE SUSHI)\n\...,,4.775
9,10,I'm trying to find a traditional Chinese recip...,Braised chicken wood ear mushroom ginger scall...,"Braised chicken, dark sauce, wood ear mushroom...","Recipe 1: Sweet potato, tea tree mushroom &\nc...","Recipe 1: Sweet potato, tea tree mushroom &\nc...","Recipe 1: Sweet potato, tea tree mushroom & ch...","Chicken recipe, dark sauce, braised or slow-co...",0.679397,[1],...,0,0,0,0,1. Clean the dried tea tree mushroom and soak ...,1. Clean the dried tea tree mushroom and soak ...,1. Clean the dried tea tree mushroom and soak ...,,,4.999


In [27]:
final_df.to_csv('chef-advisor-llm-results.csv', index=False)