In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Advanced Diarized Transcription and Domain Specific Summarization

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/diarized-transcription-summarization/diarized_transcription_and_summarization.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fdiarized-transcription-summarization%2Fdiarized_transcription_and_summarization.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/diarized-transcription-summarization/diarized_transcription_and_summarization.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/diarized-transcription-summarization/diarized_transcription_and_summarization.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/diarized-transcription-summarization/diarized_transcription_and_summarization.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/diarized-transcription-summarization/diarized_transcription_and_summarization.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/diarized-transcription-summarization/diarized_transcription_and_summarization.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/diarized-transcription-summarization/diarized_transcription_and_summarization.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/diarized-transcription-summarization/diarized_transcription_and_summarization.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| | |
|-|-|
| Author(s) | [Anant Nawalgaria](https://github.com/anantnawal/), [Patrick Nestler](https://github.com/nestler/)| 

## Overview

Many businesses, particularly in the financial sector (but also other industries), struggle with accurately transcribing and summarizing multilingual audio recordings which could be several hours long. This challenge is especially critical for use cases that directly impact customer experience and financial outcomes.

Some key obstacles include:

- **Hallucinations**: AI models sometimes generate incorrect or nonsensical information.
- **Numerical Inaccuracies**: Precise transcription of numbers is crucial in finance, and errors can have serious consequences.
- **Speaker Misidentification for multilingual conversations**: Accurately attributing dialogue in multi-speaker settings, especially with more than two participants in multilingual settings, can be complex.
- **Summarization Deficiencies**: Generating concise summaries tailored to the specific financial domain is essential for efficient analysis.
- **Recordings which are several hours long**: Ensuring diarization and transcription can be both effectively and efficiently performed on long recordings.

This notebook demonstrates sample code for a semi-agentic, multimodal [solution developed for Commerzbank](https://cloud.google.com/blog/products/ai-machine-learning/how-commerzbank-is-transforming-financial-advisory-workflows-with-gen-ai).  A more advanced version of this solution is currently deployed in production, delivering substantial productivity gains.

### Objectives

In this tutorial, you will learn how to build an advanced diarized transcription and domain/task-specific summarization pipeline using the multimodal capabilities of Gemini on Vertex AI together with the Gen AI Evaluation Service in Vertex AI, using the Vertex AI SDK for Python.
You will complete the following tasks:

- Install the Vertex AI SDK for Python
- Chunk an audio file into segments of pre specified durations
- Use Gemini on Vertex AI to interact with the audio files
  - Gemini 1.5 Pro (`gemini-1.5-pro`) model:
    - use few-shot multimodal prompting combined with task-specific instructions to perform sequential diarized transcription of all the contiguous audio chunks, using the output generated at each step as input for the next one
    - extract task/domain specific facts and figures 
    - generate multiple task specific summaries from the extracted facts and diarized transcript
    - select the best summary for each task, using pointwise and pairwise evaluations using the Gen AI Evaluation Service.

## Get started

### Install Vertex AI SDK and other required packages


In [None]:
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation]

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Shut the current kernel down with restart=True so the packages installed above
# become importable once the kernel comes back up.
IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# The `google.colab` module is only loaded when the notebook runs inside Colab;
# everywhere else this cell is a no-op.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Resolve the Google Cloud project and region, then initialise the Vertex AI SDK.
# The environment variables are used when the user doesn't provide a Project ID.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Fail fast: wrapping a missing variable in `str(...)` would silently yield the
# bogus project ID "None" and only surface later as an obscure API error.
if not PROJECT_ID:
    raise ValueError(
        "No Google Cloud project configured: set PROJECT_ID in this cell or "
        "export the GOOGLE_CLOUD_PROJECT environment variable."
    )

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

In [1]:
from collections.abc import Callable
import functools
from functools import partial
import os
import tempfile
import uuid

from google.cloud import aiplatform, storage
import nest_asyncio
import pandas as pd
from pydub import AudioSegment
from pydub.utils import make_chunks
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part

# Patch asyncio so that event loops can be nested inside the notebook's
# already-running loop (presumably needed by the evaluation calls below — confirm).
# NOTE(review): the trailing `# @param` form annotation has no variable to bind to
# and looks like a leftover; confirm and remove.
nest_asyncio.apply()  # @param {type:"string", isTemplate: true}

## Load Helper Functions
In this section we are going to define the various functions involved in developing an advanced diarized transcription and domain/task specific summarization system.

### Step 1. Audio chunking.
In this step we split the audio into smaller chunks of a predefined duration, so that large audio files (lasting several hours) whose transcripts would exceed the output token limit of the Gemini 1.5 model (`8192` tokens at the time of publication) can still be transcribed.

In [None]:
# Shared Cloud Storage client used by the helper functions in this notebook.
# The project is passed explicitly because environments without a default
# project (e.g. Colab after `auth.authenticate_user()`) cannot infer one.
storage_client = storage.Client(project=PROJECT_ID)


def chunk_audio(
    audio_url: str,
    output_bucket: str,
    output_format: str = "wav",
    chunk_length: int = 600000,
) -> list[str]:
    """Split an audio file stored in GCS into fixed-length chunks and upload them to GCS.

    Args:
        audio_url (str): GCS URL (``gs://bucket/path/to/file``) of the audio file
          to be chunked.
        output_bucket (str): Bucket that receives the chunks, given either as a
          bare bucket name or as a ``gs://bucket`` URL.
        output_format (str): Audio format used both to decode the source file and
          to encode the chunks (default: wav).
        chunk_length (int): Desired chunk length in milliseconds (default:
          600000, i.e. 10 minutes).

    Returns:
        list[str]: ``gs://`` URLs of the uploaded chunks, in playback order. Each
        chunk is stored under the source object's path with ``_part<N>`` inserted
        before the extension.
    """
    # "gs://bucket/dir/file.wav".split("/") -> ["gs:", "", "bucket", "dir", "file.wav"]
    url_parts = audio_url.split("/")
    bucket_name = url_parts[2]
    input_blob_name = "/".join(url_parts[3:])

    # Accept "gs://bucket" as well as "bucket": the client expects a bare bucket
    # name, and the returned URLs re-add the scheme below.
    if output_bucket.startswith("gs://"):
        output_bucket = output_bucket[len("gs://") :]
    output_bucket = output_bucket.rstrip("/")

    source_blob = storage_client.bucket(bucket_name).blob(input_blob_name)
    destination_bucket = storage_client.bucket(output_bucket)

    # Strip only the file extension so that dots elsewhere in the object path
    # (directory names, version suffixes) are preserved in the chunk names.
    gcs_stem = os.path.splitext(input_blob_name)[0]

    url_audio_chunks = []

    # Work inside a temporary directory so that neither the downloaded source
    # nor the exported chunks are left behind on local disk.
    with tempfile.TemporaryDirectory() as workdir:
        local_source = os.path.join(workdir, f"source.{output_format}")
        with open(local_source, "wb") as source_file:
            storage_client.download_blob_to_file(source_blob, source_file)

        # Split the audio using pydub
        audio = AudioSegment.from_file(local_source, format=output_format)

        for part_number, chunk in enumerate(make_chunks(audio, chunk_length), start=1):
            chunk_suffix = f"_part{part_number}.{output_format}"
            local_chunk = os.path.join(workdir, f"chunk{chunk_suffix}")
            # `export` returns the open output file handle; close it so the data
            # is flushed before the upload reads the file.
            chunk.export(local_chunk, format=output_format).close()

            # Upload the split chunk back to GCS
            chunk_name_gcs = f"{gcs_stem}{chunk_suffix}"
            destination_bucket.blob(chunk_name_gcs).upload_from_filename(local_chunk)
            url_audio_chunks.append(f"gs://{output_bucket}/{chunk_name_gcs}")

    return url_audio_chunks

### Step 2. Advanced diarized transcription
This process focuses on creating a highly accurate transcript from audio data using Gemini 1.5 Pro. Here's the breakdown:

- Diarization: The audio is broken into chunks and processed sequentially. Gemini 1.5 Pro identifies the speakers in each chunk and attributes their words correctly.
- Contextual Processing: To ensure accuracy, the model receives the transcript generated up to the current chunk, along with carefully engineered prompts and a few-shot example. This helps maintain consistency and correctly identify speakers, especially with crucial numerical information.
- Cleanup: After the full transcript is created, the individual audio chunks are deleted to save storage space in the cleanup phase.

Essentially, this is a careful, step-by-step process designed to produce a structured, accurate transcript that preserves speaker identification and important details like numbers.


In [None]:
def diarize_transcribe(
    url_audio_chunks: list[str],
    consider_previous: bool = True,
    quick: bool = False,
    file_format: str = "audio/wav",
    path_example_transcription: str = "gs://",
    path_example_audio: str = "gs://",
) -> str:
    """Transcribe contiguous audio chunks in order, with speaker diarization.

    Every request is a few-shot prompt made of: the instructions, the example
    audio, the example transcription and the audio chunk to transcribe. The
    responses of all chunks are concatenated in order.

    Args:
        url_audio_chunks (list[str]): GCS URLs of the contiguous audio chunks, in
          playback order.
        consider_previous (bool): When True, every chunk after the first is
          prompted with the transcription generated so far, so that speaker names
          stay consistent across chunks.
        quick (bool): When True use the Gemini 1.5 Flash model, otherwise
          Gemini 1.5 Pro.
        file_format (str): MIME type of both the example audio and the chunks.
        path_example_transcription (str): GCS URL of the UTF-8 text file holding
          the transcription of the example audio.
        path_example_audio (str): GCS URL of the example audio file.

    Returns:
        str: The concatenated transcription of all chunks.
    """
    example_url_parts = path_example_transcription.split("/")
    example_blob = storage_client.bucket(example_url_parts[2]).blob(
        "/".join(example_url_parts[3:])
    )
    # `download_as_string` is a deprecated alias of `download_as_bytes`.
    ex_output_transcription = example_blob.download_as_bytes().decode("utf-8")

    model_name = "gemini-1.5-flash-002" if quick else "gemini-1.5-pro-002"
    generation_model_audio = GenerativeModel(model_name)
    generation_config_audio = GenerationConfig(
        temperature=0.0,
        max_output_tokens=8192,
        candidate_count=1,
    )
    prompts_transcription_initial_p1 = """
      Transcribe the referenced audio file. The file contains a recording of an advisory call between one or more bank advisor(s) and their customer.
      Differentiate and diarize the speakers in the call clearly. Do not add time marks.
      Ensure correctness and consistency in speaker names while attributing statements during diarization.
      Ensure that spoken numbers and numerical figures are transcribed correctly and precisely.
      Some calls can also contain segments of conversations just between banking advisors, sure the diarization correctly reflects that.
      In case the call contains conversation segments between banking advisors might contain reference to internal systems, make sure this is properly transcribed.
      Here is an example of an input audio file and its corresponding transcription.
      Example Input Audio File:
      """
    prompts_transcription_initial_p2 = """
      transcription of the example audio file:
      {ex_output_transcription}
      Here is the referenced audio file to transcribe keeping the instructions above in mind:
      """
    prompts_transcription_subsequent_p1 = """
      The given file is a contiguous audio chunk, containing continuation of a recording of an advisory call between one or more bank advisor(s) and their customer.
      Differentiate and diarize the speakers in the call clearly. 
      Transcribe the referenced audio file, continuing on from the given transcription of the previous part , while also keeping speaker names consistent. Do not add time marks.
      Ensure correctness and consistency in speaker names while attributing statements during diarization.
      Ensure that spoken numbers and numerical figures are transcribed correctly and precisely.
      Some calls can also contain segments of conversations just between banking advisors, sure the diarization correctly reflects that.
      In case the call contains conversation segments between banking advisors might contain reference to internal systems, make sure this is properly transcribed.
      Here is an example of an input audio file and its corresponding transcription.
      Example Input Audio File:
      """
    prompts_transcription_subsequent_p2 = """
      transcription of the example audio file:
      {ex_output_transcription}    
      Transcription so far of previous audio chunks:
      {transcription}
      Here is the referenced audio file to transcribe keeping the instructions above in mind:
      """

    # Loop-invariant request parts: the example audio and the "initial" second
    # prompt do not depend on the chunk being processed.
    ex_audio_file = Part.from_uri(path_example_audio, mime_type=file_format)
    initial_prompt_2 = prompts_transcription_initial_p2.format(
        ex_output_transcription=ex_output_transcription
    )

    generated_transcription = ""
    for chunk_index, audio_url in enumerate(url_audio_chunks):
        # The first chunk has no history; later chunks only receive the running
        # transcription when `consider_previous` is enabled.
        if consider_previous and chunk_index > 0:
            prompt = prompts_transcription_subsequent_p1
            prompt_2 = prompts_transcription_subsequent_p2.format(
                ex_output_transcription=ex_output_transcription,
                transcription=generated_transcription,
            )
        else:
            prompt = prompts_transcription_initial_p1
            prompt_2 = initial_prompt_2

        audio_file = Part.from_uri(audio_url, mime_type=file_format)
        response = generation_model_audio.generate_content(
            contents=[prompt, ex_audio_file, prompt_2, audio_file],
            generation_config=generation_config_audio,
        )
        generated_transcription += response.text

    return generated_transcription

### Step 3 and 4: Fact extraction & Summary generation
Step 3 involves identifying key information related to the specific task/document (in this example, a financial advisory document) that needs to be completed. The model is prompted to recognize and extract crucial details such as client names, investment preferences, risk tolerance, and financial goals.

Step 4 then focuses on generating concise and accurate summaries for each field within the document. Leveraging the extracted facts from the previous step and employing Zero-shot Chain-of-Thought (CoT) prompting, Gemini creates multiple summaries tailored to the specific domain and the requirements of each form field. This ensures the generated summaries are not only informative but also comply with any internal guideline requirements.


In [None]:
def extract_facts(generated_transcription: str, quick: bool = True) -> str:
    """Extract the advisory-form facts from a call transcript.

    The transcript is embedded in a fixed prompt that lists the fields to extract
    and the guidelines to follow (the prompt asks for the answer in German), and
    the prompt is sent to Gemini with temperature 0 and at most 512 output tokens.

    Args:
        generated_transcription (str): Diarized transcript of the call(s).
        quick (bool): When True use the Gemini 1.5 Flash model, otherwise
          Gemini 1.5 Pro.

    Returns:
        str: The text of the model response.
    """
    facts_prompt_template = """You are an experienced bank advisor who provides consultations to clients regarding financial products. You conducted a telephone consultation with a client, during which you discussed the client's need for one or more financial products. This consultation was recorded. A consultation can span multiple individual conversations. The transcription or transcriptions of the recording can be found below.
    Your task is to extract information from the content of the consultation that you conducted with your client. Combine the content of multiple transcripts. Use the field definition provided below for the extraction of fields. The field definition consists of the field name and a field description for each field.

    Reason for Consultation: Describes the reason for the consultation, i.e., the reason why the consultation was conducted.

    Details of the Consultation Reason: Details of the client's needs. The following content should be mentioned in particular, if they were discussed in the conversation: financial needs, the scope of a basic transaction, interest rates, and desired conditions.

    Current Market Expectation and Market Forecast of the Client (if available): Information on the market expectation and expected changes in the market that the client has in relation to the reason for the consultation.

    Investment/Financing Horizon of the Client: Period for which the financial need exists or an investment is to be made.

    Priorities and Goals of the Client: Expected benefits for the client and priorities of the client. Also includes any exclusion criteria for certain products mentioned by the client.

    Existing Knowledge and Experience of the Client: Existing knowledge and experience of the client in relation to the reason for the consultation.

    Risk Profile of the Client: What risks the client is willing to take.

    Existing Products: Financial products that the client already uses at Commerzbank or other financial institutions in connection with the reason for the consultation.

    Products Discussed: Products that were discussed as candidates for a recommendation to meet the client's needs.

    Recommended Product: Financial product or products recommended to the client. Explicit mention of the product name/product designation.

    Reason for the Recommended Product: Recommendation of the advisor regarding the use of the product. Suitability of the product for the investment objectives, the investment horizon, and the risk profile of the client.

    Functionality of the Proposed Product: Describes the functionality of the proposed product.

    Product Advantages: The advantages of the presented product. If several products were discussed, comparison of the advantages of the recommended product to these products.

    Risk Disclosure: The risks and disadvantages of the presented product. If several products were discussed, comparison of the risks and disadvantages of the recommended product to these products.

    Product-Specific Features: Important additional information on the product and its functionality that was explained to the client. In particular, this includes mandatory disclosures such as bail-in, fee-based advice, default risk, or a negative market value.

    Higher Costs of Structured Products Compared to Plain Vanilla Products: Mentioning and naming the higher costs of structured products compared to plain vanilla products.

    Initially Negative Market Value for OTC Products: Initially negative market value for structured OTC products, with the exception of purchased options.

    Client's Feedback on the Recommendation: Client's feedback on the recommended product and further steps.

    Objection Handling: Client's objections and the advisor's response to these objections.

    Client Questions: Questions clearly attributable to the client regarding the recommendation made and the answer to these questions.

    Agreement to Receive the Documents after the Trade: Client's consent to receive the documents used in connection with the consultation only after the trade or the conclusion of the transaction.

    Client's Prior Knowledge of the Recommended Product: Describes the client's prior knowledge in the specific context of the recommended product.

    Always consider the following guidelines for extracting information:
    Do not add any statements or comments to the content of the recording.
    Verify facts contained in the document, especially product names, numerical values, dates, and sums, within the context of the entire transcript.
    Ensure that details from the conversation such as product names, sums, and interest rates are included in the extracted information.
    Retain English terms, particularly the names of financial products.
    Justify your answers with facts from the provided input and refer specifically only to sources from the audio call.
    Ensure that statements are correctly attributed to either the bank or the client. Do not include source references or timestamps.
    Use the term 'The Client' instead of the client's name.
    Provide the information grammatically correct and in German.
    Think step-by-step and check whether the results meet the above tasks.
    Here is the transcript of the telephone call or the transcripts of the telephone calls. Iterate over the transcript multiple times to increase confidence in extracting all information correctly:
    {transcription}
    """

    # Flash trades quality for speed; Pro is used when `quick` is disabled.
    model_name = "gemini-1.5-flash-002" if quick else "gemini-1.5-pro-002"
    facts_model = GenerativeModel(model_name)
    facts_config = GenerationConfig(
        temperature=0.0,
        max_output_tokens=512,
    )

    facts_prompt = facts_prompt_template.format(transcription=generated_transcription)
    facts_response = facts_model.generate_content(
        contents=facts_prompt,
        generation_config=facts_config,
    )
    return facts_response.text


def generate_summaries(
    context: str, quick: bool = False, num_summaries: int = 3
) -> tuple[str, list[str]]:
    """Generate one or more task- and domain-specific summaries of a call.

    Args:
        context (str): Text appended to the instructions, i.e. the transcript,
          optionally followed by the extracted facts.
        quick (bool): When True use the Gemini 1.5 Flash model, otherwise
          Gemini 1.5 Pro. Pass `num_summaries` by keyword: a positional second
          argument binds to `quick`.
        num_summaries (int): Number of candidate summaries requested from the
          model in a single call (`candidate_count`).

    Returns:
        tuple[str, list[str]]: The instruction prompt that was used (without the
        context) and the text of each returned candidate.
    """
    prompt_instr = """
    You are an experienced bank advisor providing consultations to clients on banking products. Create a 5-sentence summary. Ensure that details from the conversation, like sums and interest rates, are included in the summary.
    Your inputs are the transcription of the conversation and some important information extracted from this transcription, provided below. The relevant fields for you to consider are the following:
    - Reason for consultation:
    - Details of the reason for consultation:
    - Existing products:
    Do not mention the client's name, refrain from using a salutation.
    Formulate the summary as continuous text and use varied sentence beginnings. Formulate the answer as a direct customer address. Start the summary with the phrase 'The reason for our conversation was'. Address the customer in the following sentences with 'you'.
    Provide the information grammatically correctly and in English.
    Avoid translating product names into English. Make sure that all product names are pronounced as they were mentioned in the conversation.
    Avoid abbreviations. Spell out the entire word.
    Base your answer on facts from the provided inputs and refer only to sources from the available fields.
    Ensure that statements from the client and the advisor are clearly distinguished.
    Think step-by-step and check if the results meet the above tasks.
    """
    generation_model_summary = GenerativeModel(
        "gemini-1.5-pro-002".replace("pro", "flash" if quick else "pro")
    )
    # NOTE(review): with temperature=0.0 the requested candidates may turn out
    # (near-)identical; confirm that this is the intended sampling setup.
    generation_config_summary = GenerationConfig(
        temperature=0.0, max_output_tokens=512, candidate_count=num_summaries
    )
    prompt = """{instructions}
    {context}
    """.format(
        instructions=prompt_instr, context=context
    )

    response = generation_model_summary.generate_content(
        contents=prompt,
        generation_config=generation_config_summary,
    )
    generated_summaries = [candidate.text for candidate in response.candidates]
    return prompt_instr, generated_summaries

### Step 5: Summary optimization by selecting the best one
To ensure the highest quality output, the multiple summaries generated for each form field are [evaluated and the best summary for each field is selected](https://cloud.google.com/blog/products/ai-machine-learning/enhancing-llm-quality-and-interpretability-with-the-vertex-gen-ai-evaluation-service?e=48754805) using the Vertex AI Gen AI Evaluation Service. Importantly, the service also provides a human-readable explanation for its selection, enabling sales advisors to understand the reasoning behind the AI's choices and maintain trust in the automated process.

In [None]:
# Name of the Vertex AI experiment under which the evaluation runs below are logged.
experiment_name = "summarization-quality"


def pointwise_eval(
    instruction: str,
    context: str,
    responses: list[str],
    experiment_name: str = experiment_name,
    eval_metrics: list[MetricPromptTemplateExamples.Pointwise] = [
        MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY,
        MetricPromptTemplateExamples.Pointwise.GROUNDEDNESS,
    ],
) -> vertexai.evaluation._base.EvalResult:
    """Run a pointwise evaluation of each response against the given metrics.

    Every response is paired with the same instruction and context. By default
    the metrics are summarization quality and groundedness; the full list can be
    found on the website:
    https://cloud.google.com/vertex-ai/generative-ai/docs/models/online-pipeline-services

    Args:
        instruction (str): Instruction the responses were generated from.
        context (str): Context the responses were generated from.
        responses (list[str]): Generated responses to evaluate.
        experiment_name (str): Vertex AI experiment the run is logged under.
        eval_metrics (list): Pointwise metrics to compute.

    Returns:
        The evaluation result returned by `EvalTask.evaluate`, with one row per
        response.
    """
    num_responses = len(responses)

    eval_dataset = pd.DataFrame(
        {
            "instruction": [instruction] * num_responses,
            "context": [context] * num_responses,
            "response": responses,
        }
    )

    eval_task = EvalTask(
        dataset=eval_dataset,
        # Pass a copy so the shared default list is never handed out for mutation.
        metrics=list(eval_metrics),
        experiment=experiment_name,
    )
    # A random suffix keeps run names unique within the experiment.
    results = eval_task.evaluate(
        prompt_template="{instruction} \n {context}",
        experiment_run_name="gemini-summ-pointwise-" + str(uuid.uuid4()),
    )

    return results


def pairwise_greater(
    instructions: str,
    context: str,
    project_id: str,
    location: str,
    experiment_name: str,
    baseline: str,
    candidate: str,
) -> tuple[str, str]:
    """Pick the better of two responses via a pairwise summarization-quality evaluation.

    More details on the evaluation service and the other quality metrics this
    function can be extended to can be found on
    https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/evaluation

    Args:
        instructions (str): Instruction the responses were generated from.
        context (str): Context the responses were generated from.
        project_id (str): Unused; kept so existing positional callers keep working.
        location (str): Unused; kept so existing positional callers keep working.
        experiment_name (str): Vertex AI experiment the run is logged under.
        baseline (str): Response evaluated as the baseline.
        candidate (str): Response evaluated as the candidate.

    Returns:
        tuple[str, str]: The chosen response and the explanation reported by the
        evaluation. `baseline` is returned only when the reported choice is
        "BASELINE"; any other value selects `candidate`.
    """
    choice_column = "pairwise_summarization_quality/pairwise_choice"
    explanation_column = "pairwise_summarization_quality/explanation"

    # Single-row dataset: one baseline/candidate pair per call.
    eval_dataset = pd.DataFrame(
        {
            "instruction": [instructions],
            "context": [context],
            "response": [candidate],
            "baseline_model_response": [baseline],
        }
    )

    eval_task = EvalTask(
        dataset=eval_dataset,
        metrics=[
            MetricPromptTemplateExamples.Pairwise.SUMMARIZATION_QUALITY,
        ],
        experiment=experiment_name,
    )
    # A random suffix keeps run names unique within the experiment.
    results = eval_task.evaluate(
        prompt_template="{instruction} \n {context}",
        experiment_run_name="gemini-summ-pairwise-" + str(uuid.uuid4()),
    )
    result = results.metrics_table[[choice_column, explanation_column]].to_dict(
        "records"
    )[0]

    choice = baseline if result[choice_column] == "BASELINE" else candidate
    return (choice, result[explanation_column])


def greater(cmp: Callable[[str, str], tuple[str, str]], a: str, b: str) -> int:
    """Three-way comparator built on a "pick the better one" function.

    Suitable for `functools.cmp_to_key`.

    Args:
        cmp: Callable taking `(a, b)` and returning `(choice, explanation)`, where
          `choice` is whichever of the two inputs it prefers.
        a (str): First value.
        b (str): Second value.

    Returns:
        int: 0 when `a` and `b` are equal (`cmp` is not called), 1 when `cmp`
        chooses `a`, and -1 otherwise.
    """
    # Identical inputs must compare equal; this also avoids a needless (and
    # potentially expensive) call to `cmp`.
    if a == b:
        return 0

    choice, _explanation = cmp(a, b)
    return 1 if choice == a else -1


def select_best_response(
    instruction: str, context: str, responses: list[str]
) -> tuple[str, dict]:
    """Select the best response and return it with its pointwise quality metrics.

    The process consists of two steps:
    1. Selecting the best response by using pairwise summarization-quality
    comparisons between the responses.
    2. Doing pointwise evaluation of the best response and returning its
    summarization and groundedness metrics along with the best response.

    Args:
        instruction (str): Instruction the responses were generated from.
        context (str): Context the responses were generated from.
        responses (list[str]): Candidate responses; must not be empty.

    Returns:
        tuple[str, dict]: The best response and a dict of the metrics-table
        columns whose names contain "summarization" or "groundedness".

    Raises:
        ValueError: If `responses` is empty.
    """
    if not responses:
        raise ValueError("`responses` must contain at least one candidate response.")

    # Bind everything except the two responses being compared.
    cmp_f = partial(
        pairwise_greater, instruction, context, PROJECT_ID, LOCATION, experiment_name
    )
    cmp_greater = partial(greater, cmp_f)

    pairwise_best_response = max(responses, key=functools.cmp_to_key(cmp_greater))
    pointwise_metric = pointwise_eval(
        instruction, context, [pairwise_best_response], experiment_name
    )
    metric_columns = [
        col
        for col in pointwise_metric.metrics_table.columns
        if ("summarization" in col) or ("groundedness" in col)
    ]
    # Only one response was evaluated, so the table has a single row.
    qa_metrics = pointwise_metric.metrics_table[metric_columns].to_dict("records")[0]
    return pairwise_best_response, qa_metrics

We put all the steps together in one final step

## End-to-end execution for diarized transcription and summarization

In this step you will run all the steps mentioned in sequence on a toy audio file.
- First we do step 1 and 2: to chunk the audio file into smaller contiguous chunks, and then sequentially iterate over them to do advanced multimodal task/domain-specific diarized transcription. 
- Then step 3 to extract facts is performed.

In [None]:
use_facts_for_summaries: bool = True
# Bucket that stores the temporary chunked audio files. Enter the bare bucket
# name (no "gs://" prefix): it is used as a bucket name by the storage client and
# the "gs://" scheme is added when the chunk URLs are built.
output_bucket: str = "your_output_bucket"
audio_url = (
    "gs://github-repo/use-cases/diarization-transcription-summarization/rec1.wav"
)

# Steps 1 and 2: chunk the recording, then transcribe the chunks sequentially.
url_audio_chunks = chunk_audio(audio_url, output_bucket)
print(url_audio_chunks)
print("Finished audio chunking")
generated_transcription = diarize_transcribe(
    url_audio_chunks,
    path_example_transcription="gs://github-repo/use-cases/diarization-transcription-summarization/transcript_example.txt",
    path_example_audio="gs://github-repo/use-cases/diarization-transcription-summarization/rec2.wav",
)
print("Finished transcription")

# Step 3: extract the domain-specific facts from the transcript.
generated_facts = extract_facts(generated_transcription)
print("Finished fact generation")

In [None]:
# Show the diarized transcript produced by `diarize_transcribe`.
print(generated_transcription)

In [None]:
# Show the facts extracted from the transcript by `extract_facts`.
print(generated_facts)

Then step 4 and 5 to generate the optimized summaries based on the transcript and facts generated.

In [None]:
# Context handed to the summarizer: the transcript, optionally followed by the
# extracted facts.
context_summary = (
    """Transcript:
{transcription}
Excerpt from the transcript containing facts:
{facts}
""".format(
        transcription=generated_transcription, facts=generated_facts
    )
    if use_facts_for_summaries
    else """Transcript:
    {transcription}
    """.format(
        transcription=generated_transcription
    )
)

# Pass the number of summaries by keyword: the second positional parameter of
# `generate_summaries` is `quick`, so a positional 3 would silently select the
# Flash model instead of setting the candidate count.
summary_prompt, generated_summaries = generate_summaries(
    context_summary, num_summaries=3
)
# `best_summary` is a (summary, metrics) tuple.
best_summary = select_best_response(
    summary_prompt, context_summary, generated_summaries
)
print(best_summary)

## Cleanup
In this step we delete the temporary files and experiments generated during this lab.

In [None]:
aiplatform.init(project=PROJECT_ID, location=LOCATION)
experiment = aiplatform.Experiment(experiment_name)
experiment.delete()
for url in url_audio_chunks:
    bucket_name = url.replace("gs://", "").split("/")[0]
    object_name = "/".join(url.replace("gs://", "").split("/")[1:])

    # Get the bucket and delete the object
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(object_name)