# Task 1

In [49]:
%pip install --upgrade --quiet google-genai nest-asyncio==1.5.9

In [50]:
import pandas as pd
from inspect import cleandoc
from IPython.display import display, Markdown

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
    PairwiseMetric,
    PairwiseMetricPromptTemplate,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
)

pd.set_option("display.max_colwidth", None)

In [51]:
PROJECT_ID = "qwiklabs-gcp-02-8875a02eb56d"  # @param {type:"string"}
# LOCATION = "us-west1" # @param {type:"string"}
LOCATION = "us-central1" # @param {type:"string"}

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Task 2

In [96]:
hourly_rates = cleandoc("""
  Screenwriter: $40
  Actor: $25
  Director: $30
  Camera Operator: $35
  Sound Engineer: $20
  Editor: $30
  """)

planning_notes = cleandoc("""
 Phases of Production:
   Writing:
   The Screenwriter will write the script.
   They need 72 hours to do so.


   Pre-Production:
   The Director needs time to analyze the script.
   They will work on it for 36 hours.
   The Camera Operator will join the director for 24 hours of planning.


   Production Phase 1
   The first three days of filming will require the director, 4 actors, the camera operator, and the sound engineer


   Production Phase 2
   The next three days of filming will require the director, 8 actors, the camera operator, and the sound engineer


   Post-Production
   The editor will take 64 hours to edit the film.
   The director will work with the editor for 24 hours during this phase.
""")

tasks = [
    """What is the cost of each phase of production?
    If days are mentioned, assume an 8 hour work day.""",

    """How many days will each phase require? Assume an
    8 hour work day. If multiple people are working in parallel,
    do not add those times together, but only use the longest time.
    Also include a count of the total number of days of the entire
    project.""",

    """Prepare a text schedule for all phases of the film starting
    on Feb 3, 2025. The whole crew should be off Saturdays
    and Sundays."""
]

prompt_template = cleandoc("""
  <instructions>
  Prepare a document to fulfill the task based on the context provided.
  </instructions>
<task>
  {task}
  </task>
<context>
  {context}
  </context>
  """)

In [97]:
context = hourly_rates + "\n\n" + planning_notes

In [98]:
llm_pro = GenerativeModel(
  "gemini-2.5-pro-preview-05-06",
  generation_config={
      "temperature": 0,
  },
)

llm_flash = GenerativeModel(
  "gemini-2.0-flash-001",
  generation_config={
      "temperature": 0,
  },
)

In [99]:
prompt = prompt_template.format(task=tasks[1], context=context)

In [100]:
response_pro = llm_pro.generate_content(prompt)
response_flash = llm_flash.generate_content(prompt)

In [101]:
display(Markdown("# Gemini Pro Response\n\n" + response_pro.text))
display(Markdown("# Gemini Flash Response\n\n" + response_flash.text))

# Gemini Pro Response

Okay, here's the breakdown of days required for each phase, assuming an 8-hour workday and using the longest time for parallel work:

**Phase Durations:**

*   **Writing:**
    *   Screenwriter: 72 hours
    *   Days: 72 hours / 8 hours/day = **9 days**

*   **Pre-Production:**
    *   Director: 36 hours
    *   Camera Operator (with Director): 24 hours
    *   The longest duration is the Director's.
    *   Days: 36 hours / 8 hours/day = **4.5 days**

*   **Production Phase 1:**
    *   The phase is explicitly stated as "the first three days of filming."
    *   Days: **3 days**

*   **Production Phase 2:**
    *   The phase is explicitly stated as "the next three days of filming."
    *   Days: **3 days**

*   **Post-Production:**
    *   Editor: 64 hours
    *   Director (with Editor): 24 hours
    *   The longest duration is the Editor's.
    *   Days: 64 hours / 8 hours/day = **8 days**

**Summary of Days per Phase:**

*   Writing: **9 days**
*   Pre-Production: **4.5 days**
*   Production Phase 1: **3 days**
*   Production Phase 2: **3 days**
*   Post-Production: **8 days**

**Total Number of Days for the Entire Project:**

Total Days = 9 (Writing) + 4.5 (Pre-Production) + 3 (Production 1) + 3 (Production 2) + 8 (Post-Production)
Total Days = **27.5 days**

# Gemini Flash Response

**Phase Durations (8-hour workday):**

*   **Writing:** 72 hours / 8 hours/day = 9 days (Screenwriter)
*   **Pre-Production:**
    *   Director: 36 hours / 8 hours/day = 4.5 days
    *   Camera Operator: 24 hours / 8 hours/day = 3 days
    *   Longest time: 4.5 days
*   **Production Phase 1:** 3 days (Director, Actors, Camera Operator, Sound Engineer)
*   **Production Phase 2:** 3 days (Director, Actors, Camera Operator, Sound Engineer)
*   **Post-Production:**
    *   Editor: 64 hours / 8 hours/day = 8 days
    *   Director: 24 hours / 8 hours/day = 3 days
    *   Longest time: 8 days

**Total Project Duration:**

9 days (Writing) + 4.5 days (Pre-Production) + 3 days (Production Phase 1) + 3 days (Production Phase 2) + 8 days (Post-Production) = **27.5 days**


# Task 3

In [57]:
eval_dataset = pd.DataFrame([{
    "prompt": prompt,
    "response": response_flash.text,
    "baseline_model_response": response_pro.text
}])

In [58]:
eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[MetricPromptTemplateExamples.Pairwise.QUESTION_ANSWERING_QUALITY],
    experiment="indie-film-planning"
)

# Task 4

In [59]:
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
    PairwiseMetric,
    PairwiseMetricPromptTemplate,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
)

MetricPromptTemplateExamples.list_example_metric_names()

['coherence',
 'fluency',
 'safety',
 'groundedness',
 'instruction_following',
 'verbosity',
 'text_quality',
 'summarization_quality',
 'question_answering_quality',
 'multi_turn_chat_quality',
 'multi_turn_safety',
 'pairwise_coherence',
 'pairwise_fluency',
 'pairwise_safety',
 'pairwise_groundedness',
 'pairwise_instruction_following',
 'pairwise_verbosity',
 'pairwise_text_quality',
 'pairwise_summarization_quality',
 'pairwise_question_answering_quality',
 'pairwise_multi_turn_chat_quality',
 'pairwise_multi_turn_safety']

In [61]:
import datetime

run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

eval_result = eval_task.evaluate(
    prompt_template="{prompt}",
    experiment_run_name=f"indie-film-planning-eval-{run_ts}"
)

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{prompt}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 1 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 1/1 [00:00<00:00,  1.48it/s]
INFO:vertexai.evaluation._evaluation:All 1 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:0.6851342499999191 seconds


In [72]:
eval_results_to_compare = []
eval_results_to_compare.append(eval_result)

In [71]:
display(Markdown("# Summary Metrics:\n\n"))
display(eval_result.summary_metrics)

display(Markdown("\n\n# Metrics_table:"))
display(eval_result.metrics_table)

display(Markdown("\n\n# Model Choice:"))
display(eval_result.metrics_table["pairwise_question_answering_quality/pairwise_choice"])

display(Markdown("\n\n# Model Choice Explanation:"))
display(eval_result.metrics_table["pairwise_question_answering_quality/explanation"])

# Summary Metrics:



{'row_count': 1,
 'pairwise_question_answering_quality/candidate_model_win_rate': np.float64(0.0),
 'pairwise_question_answering_quality/baseline_model_win_rate': np.float64(0.0)}



# Metrics_table:

Unnamed: 0,prompt,response,baseline_model_response,pairwise_question_answering_quality/explanation,pairwise_question_answering_quality/pairwise_choice
0,"<instructions>\n Prepare a document to fulfill the task based on the context provided.\n </instructions>\n<task>\n How many days will each phase require? Assume an\n 8 hour work day. If multiple people are working in parallel,\n do not add those times together, but only use the longest time.\n Also include a count of the total number of days of the entire\n project.\n </task>\n<context>\n Screenwriter: $40\nActor: $25\nDirector: $30\nCamera Operator: $35\nSound Engineer: $20\nEditor: $30\n\nPhases of Production:\n Writing:\n The Screenwriter will write the script.\n They need 72 hours to do so.\n\n\n Pre-Production:\n The Director needs time to analyze the script.\n They will work on it for 36 hours.\n The Camera Operator will join the director for 24 hours of planning.\n\n\n Production Phase 1\n The first three days of filming will require the director, 4 actors, the camera operator, and the sound engineer\n\n\n Production Phase 2\n The next three days of filming will require the director, 8 actors, the camera operator, and the sound engineer\n\n\n Post-Production\n The editor will take 64 hours to edit the film.\n The director will work with the editor for 24 hours during this phase.\n </context>\n","**Phase Durations (8-hour workday):**\n\n* **Writing:** 72 hours / 8 hours/day = 9 days (Screenwriter)\n* **Pre-Production:**\n * Director: 36 hours / 8 hours/day = 4.5 days\n * Camera Operator: 24 hours / 8 hours/day = 3 days\n * Longest time: 4.5 days\n* **Production Phase 1:** 3 days (Director, Actors, Camera Operator, Sound Engineer)\n* **Production Phase 2:** 3 days (Director, Actors, Camera Operator, Sound Engineer)\n* **Post-Production:**\n * Editor: 64 hours / 8 hours/day = 8 days\n * Director: 24 hours / 8 hours/day = 3 days\n * Longest time: 8 days\n\n**Total Project Duration:**\n\n9 days (Writing) + 4.5 days (Pre-Production) + 3 days (Production Phase 1) + 3 days (Production Phase 2) + 8 days (Post-Production) = **27.5 days**\n","Okay, here's the breakdown of days required for each phase, assuming an 8-hour workday and using the longest time for parallel work:\n\n**Phase Durations:**\n\n* **Writing:**\n * Screenwriter: 72 hours\n * Days: 72 hours / 8 hours/day = **9 days**\n\n* **Pre-Production:**\n * Director: 36 hours\n * Camera Operator (with Director): 24 hours\n * The longest duration is the Director's.\n * Days: 36 hours / 8 hours/day = **4.5 days**\n\n* **Production Phase 1:**\n * The phase is explicitly stated as ""the first three days of filming.""\n * Days: **3 days**\n\n* **Production Phase 2:**\n * The phase is explicitly stated as ""the next three days of filming.""\n * Days: **3 days**\n\n* **Post-Production:**\n * Editor: 64 hours\n * Director (with Editor): 24 hours\n * The longest duration is the Editor's.\n * Days: 64 hours / 8 hours/day = **8 days**\n\n**Summary of Days per Phase:**\n\n* Writing: **9 days**\n* Pre-Production: **4.5 days**\n* Production Phase 1: **3 days**\n* Production Phase 2: **3 days**\n* Post-Production: **8 days**\n\n**Total Number of Days for the Entire Project:**\n\nTotal Days = 9 (Writing) + 4.5 (Pre-Production) + 3 (Production 1) + 3 (Production 2) + 8 (Post-Production)\nTotal Days = **27.5 days**","Both responses answer the question, include all of the phases and the total, and give correct numbers.",TIE




# Model Choice:

Unnamed: 0,pairwise_question_answering_quality/pairwise_choice
0,TIE




# Model Choice Explanation:

Unnamed: 0,pairwise_question_answering_quality/explanation
0,"Both responses answer the question, include all of the phases and the total, and give correct numbers."


# Redo from Task 3

In [77]:
eval_dataset = pd.DataFrame([{
    "prompt": prompt
}])

In [78]:
pairwise_metric = PairwiseMetric(
    metric="question_answering_quality",
    baseline_model=llm_pro,
    metric_prompt_template=PairwiseMetricPromptTemplate(
        criteria={
            "overall_quality": "Which response better answers the question based on accuracy, clarity, and completeness?"
        },
        rating_rubric={
            "A": "Candidate",
            "B": "Baseline",
            "tie": "Equally good"
        }
    )
)

INFO:vertexai.evaluation.metrics.metric_prompt_template:The `input_variables` parameter is empty. Only the `response` and `baseline_model_response` columns are used for computing this model-based metric.


In [79]:
eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[pairwise_metric],
    experiment="indie-film-planning"
)

In [80]:
import datetime

run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

eval_result = eval_task.evaluate(
    model=llm_flash,
    prompt_template="{prompt}",
    experiment_run_name=f"film-eval-{run_ts}"
)

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{prompt}', 'model_name': 'publishers/google/models/gemini-2.0-flash-001', 'temperature': 0}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 1/1 [00:01<00:00,  1.58s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully generated from Gemini model gemini-2.0-flash-001.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 1.5875181400001566 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.5-pro-preview-05-06.
100%|██████████| 1/1 [00:12<00:00, 12.20s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully gener

In [81]:
eval_results_to_compare = []
eval_results_to_compare.append(eval_result)

from IPython.display import display, Markdown

display(Markdown("# 📊 Summary Metrics"))
display(eval_result.summary_metrics)

display(Markdown("## 🧮 Full Metrics Table"))
display(eval_result.metrics_table)

display(Markdown("## 🏆 Preferred Model"))
display(eval_result.metrics_table["question_answering_quality/pairwise_choice"])

display(Markdown("## 📝 Explanation"))
display(eval_result.metrics_table["question_answering_quality/explanation"])

# 📊 Summary Metrics

{'row_count': 1,
 'question_answering_quality/candidate_model_win_rate': np.float64(0.0),
 'question_answering_quality/baseline_model_win_rate': np.float64(0.0)}

## 🧮 Full Metrics Table

Unnamed: 0,prompt,response,baseline_model_response,question_answering_quality/explanation,question_answering_quality/pairwise_choice
0,"<instructions>\n Prepare a document to fulfill the task based on the context provided.\n </instructions>\n<task>\n How many days will each phase require? Assume an\n 8 hour work day. If multiple people are working in parallel,\n do not add those times together, but only use the longest time.\n Also include a count of the total number of days of the entire\n project.\n </task>\n<context>\n Screenwriter: $40\nActor: $25\nDirector: $30\nCamera Operator: $35\nSound Engineer: $20\nEditor: $30\n\nPhases of Production:\n Writing:\n The Screenwriter will write the script.\n They need 72 hours to do so.\n\n\n Pre-Production:\n The Director needs time to analyze the script.\n They will work on it for 36 hours.\n The Camera Operator will join the director for 24 hours of planning.\n\n\n Production Phase 1\n The first three days of filming will require the director, 4 actors, the camera operator, and the sound engineer\n\n\n Production Phase 2\n The next three days of filming will require the director, 8 actors, the camera operator, and the sound engineer\n\n\n Post-Production\n The editor will take 64 hours to edit the film.\n The director will work with the editor for 24 hours during this phase.\n </context>\n","**Phase Durations (8-hour workday):**\n\n* **Writing:** 72 hours / 8 hours/day = 9 days (Screenwriter)\n* **Pre-Production:**\n * Director: 36 hours / 8 hours/day = 4.5 days\n * Camera Operator: 24 hours / 8 hours/day = 3 days\n * Longest time: 4.5 days\n* **Production Phase 1:** 3 days (Director, Actors, Camera Operator, Sound Engineer)\n* **Production Phase 2:** 3 days (Director, Actors, Camera Operator, Sound Engineer)\n* **Post-Production:**\n * Editor: 64 hours / 8 hours/day = 8 days\n * Director: 24 hours / 8 hours/day = 3 days\n * Longest time: 8 days\n\n**Total Project Duration:**\n\n9 days (Writing) + 4.5 days (Pre-Production) + 3 days (Production Phase 1) + 3 days (Production Phase 2) + 8 days (Post-Production) = **27.5 days**\n","Okay, let's break down the project timeline based on an 8-hour workday.\n\n**Phase Durations:**\n\n1. **Writing:**\n * Screenwriter: 72 hours\n * Days: 72 hours / 8 hours/day = **9 days**\n\n2. **Pre-Production:**\n * Director: 36 hours\n * Camera Operator (with Director): 24 hours\n * The longest duration here is the Director's 36 hours.\n * Days: 36 hours / 8 hours/day = **4.5 days**\n\n3. **Production Phase 1:**\n * The phase is explicitly stated to last for **3 days**.\n * (Director, 4 actors, camera operator, sound engineer all work these 3 days)\n\n4. **Production Phase 2:**\n * The phase is explicitly stated to last for **3 days**.\n * (Director, 8 actors, camera operator, sound engineer all work these 3 days)\n\n5. **Post-Production:**\n * Editor: 64 hours\n * Director (with Editor): 24 hours\n * The longest duration here is the Editor's 64 hours.\n * Days: 64 hours / 8 hours/day = **8 days**\n\n**Summary of Days per Phase:**\n\n* **Writing:** 9 days\n* **Pre-Production:** 4.5 days\n* **Production Phase 1:** 3 days\n* **Production Phase 2:** 3 days\n* **Post-Production:** 8 days\n\n**Total Number of Days for the Entire Project:**\n\nTotal Days = 9 (Writing) + 4.5 (Pre-Production) + 3 (Production 1) + 3 (Production 2) + 8 (Post-Production)\nTotal Days = **27.5 days**",Both responses correctly calculate the project timeline based on an 8-hour workday and provide the same final answer.,TIE


## 🏆 Preferred Model

Unnamed: 0,question_answering_quality/pairwise_choice
0,TIE


## 📝 Explanation

Unnamed: 0,question_answering_quality/explanation
0,Both responses correctly calculate the project timeline based on an 8-hour workday and provide the same final answer.


# Redo from Task 3

In [82]:
eval_dataset = pd.DataFrame([{"prompt": prompt}])

In [83]:
from vertexai.evaluation import PairwiseMetric, PairwiseMetricPromptTemplate

metric_flash_baseline = PairwiseMetric(
    metric="question_answering_quality",
    baseline_model=llm_flash,
    metric_prompt_template=PairwiseMetricPromptTemplate(
        criteria={"overall_quality": "Which response better answers the question based on accuracy, clarity, and completeness?"},
        rating_rubric={"A": "Candidate", "B": "Baseline", "tie": "Equally good"}
    )
)

INFO:vertexai.evaluation.metrics.metric_prompt_template:The `input_variables` parameter is empty. Only the `response` and `baseline_model_response` columns are used for computing this model-based metric.


In [84]:
from vertexai.evaluation import EvalTask

eval_task_pro = EvalTask(
    dataset=eval_dataset,
    metrics=[metric_flash_baseline],  # ✅ compare pro vs flash
    experiment="indie-film-planning"
)

In [85]:
import datetime

run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

eval_result_pro = eval_task_pro.evaluate(
    model=llm_pro,
    prompt_template="{prompt}",
    experiment_run_name=f"eval-pro-vs-flash-{run_ts}"
)

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{prompt}', 'model_name': 'publishers/google/models/gemini-2.5-pro-preview-05-06', 'temperature': 0}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.5-pro-preview-05-06.
100%|██████████| 1/1 [00:11<00:00, 11.56s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully generated from Gemini model gemini-2.5-pro-preview-05-06.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 11.561197023999739 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 1/1 [00:01<00:00,  1.92s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are su

In [86]:
# Define pairwise metric with Pro as baseline
metric_pro_baseline = PairwiseMetric(
    metric="question_answering_quality",
    baseline_model=llm_pro,
    metric_prompt_template=PairwiseMetricPromptTemplate(
        criteria={"overall_quality": "Which response better answers the question based on accuracy, clarity, and completeness?"},
        rating_rubric={"A": "Candidate", "B": "Baseline", "tie": "Equally good"}
    )
)

eval_task_flash = EvalTask(
    dataset=eval_dataset,
    metrics=[metric_pro_baseline],
    experiment="indie-film-planning"
)

eval_result_flash = eval_task_flash.evaluate(
    model=llm_flash,
    prompt_template="{prompt}",
    experiment_run_name=f"eval-flash-vs-pro-{run_ts}"
)

INFO:vertexai.evaluation.metrics.metric_prompt_template:The `input_variables` parameter is empty. Only the `response` and `baseline_model_response` columns are used for computing this model-based metric.


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{prompt}', 'model_name': 'publishers/google/models/gemini-2.0-flash-001', 'temperature': 0}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully generated from Gemini model gemini-2.0-flash-001.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 1.6512665710001784 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.5-pro-preview-05-06.
100%|██████████| 1/1 [00:12<00:00, 12.52s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully gener

In [87]:
eval_results_to_compare = [eval_result_flash, eval_result_pro]

# Redo from Task 3

In [88]:
eval_dataset = pd.DataFrame([{
    "prompt": prompt
}])

In [89]:
# Shared pairwise metric prompt template
pairwise_prompt_template = PairwiseMetricPromptTemplate(
    criteria={
        "overall_quality": "Which response better answers the question based on accuracy, clarity, and completeness?"
    },
    rating_rubric={
        "A": "Candidate",
        "B": "Baseline",
        "tie": "Equally good"
    }
)


INFO:vertexai.evaluation.metrics.metric_prompt_template:The `input_variables` parameter is empty. Only the `response` and `baseline_model_response` columns are used for computing this model-based metric.


In [90]:
# --------------------------
# 🔁 Evaluate LLM Flash vs Pro
# --------------------------
metric_pro_as_baseline = PairwiseMetric(
    metric="question_answering_quality",
    baseline_model=llm_pro,
    metric_prompt_template=pairwise_prompt_template
)


In [91]:
eval_task_flash = EvalTask(
    dataset=eval_dataset,
    metrics=[metric_pro_as_baseline],
    experiment="indie-film-planning"
)

In [92]:
eval_result_flash = eval_task_flash.evaluate(
    model=llm_flash,
    prompt_template="{prompt}",
    experiment_run_name="flash-vs-pro-" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
)

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{prompt}', 'model_name': 'publishers/google/models/gemini-2.0-flash-001', 'temperature': 0}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 1/1 [00:01<00:00,  1.69s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully generated from Gemini model gemini-2.0-flash-001.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 1.6936725870000373 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.5-pro-preview-05-06.
100%|██████████| 1/1 [00:11<00:00, 11.59s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully gener

In [93]:
# --------------------------
# 🔁 Evaluate LLM Pro vs Flash
# --------------------------
metric_flash_as_baseline = PairwiseMetric(
    metric="question_answering_quality",
    baseline_model=llm_flash,
    metric_prompt_template=pairwise_prompt_template
)

eval_task_pro = EvalTask(
    dataset=eval_dataset,
    metrics=[metric_flash_as_baseline],
    experiment="indie-film-planning"
)

eval_result_pro = eval_task_pro.evaluate(
    model=llm_pro,
    prompt_template="{prompt}",
    experiment_run_name="pro-vs-flash-" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
)

# ✅ Store the results (very important for the lab check)
eval_results_to_compare = [eval_result_flash, eval_result_pro]

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{prompt}', 'model_name': 'publishers/google/models/gemini-2.5-pro-preview-05-06', 'temperature': 0}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.5-pro-preview-05-06.
100%|██████████| 1/1 [00:11<00:00, 11.59s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully generated from Gemini model gemini-2.5-pro-preview-05-06.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 11.596406617999946 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 1/1 [00:01<00:00,  1.67s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are su

# Redo from Task 3

In [105]:
# Create the prompt
eval_dataset = pd.DataFrame([{"prompt": prompt}])

# Shared evaluation prompt
pairwise_prompt = PairwiseMetricPromptTemplate(
    criteria={
        "overall_quality": "Which response better answers the question based on accuracy, clarity, and completeness?"
    },
    rating_rubric={
        "A": "Candidate",
        "B": "Baseline",
        "tie": "Equally good"
    }
)

# Step 1: Flash vs Pro
metric_flash = PairwiseMetric(
    metric="question_answering_quality",
    baseline_model=llm_pro,
    metric_prompt_template=pairwise_prompt
)

eval_task_flash = EvalTask(
    dataset=eval_dataset,
    metrics=[metric_flash],
    experiment="indie-film-planning"
)

eval_result_flash = eval_task_flash.evaluate(
    model=llm_flash,
    prompt_template="{prompt}",
    experiment_run_name="flash-vs-pro-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
)

# Step 2: Pro vs Flash
metric_pro = PairwiseMetric(
    metric="question_answering_quality",
    baseline_model=llm_flash,
    metric_prompt_template=pairwise_prompt
)

eval_task_pro = EvalTask(
    dataset=eval_dataset,
    metrics=[metric_pro],
    experiment="indie-film-planning"
)

eval_result_pro = eval_task_pro.evaluate(
    model=llm_pro,
    prompt_template="{prompt}",
    experiment_run_name="pro-vs-flash-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
)

# Store BOTH results
eval_results_to_compare = []
eval_results_to_compare = [eval_result_flash, eval_result_pro]

INFO:vertexai.evaluation.metrics.metric_prompt_template:The `input_variables` parameter is empty. Only the `response` and `baseline_model_response` columns are used for computing this model-based metric.


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{prompt}', 'model_name': 'publishers/google/models/gemini-2.0-flash-001', 'temperature': 0}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully generated from Gemini model gemini-2.0-flash-001.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 1.622602820999873 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.5-pro-preview-05-06.
100%|██████████| 1/1 [00:11<00:00, 11.36s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully genera

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '{prompt}', 'model_name': 'publishers/google/models/gemini-2.5-pro-preview-05-06', 'temperature': 0}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.5-pro-preview-05-06.
100%|██████████| 1/1 [00:12<00:00, 12.20s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are successfully generated from Gemini model gemini-2.5-pro-preview-05-06.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 12.204464918000212 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 1 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
INFO:vertexai.evaluation._evaluation:All 1 responses are su