In [1]:
%pip install --upgrade --quiet google-cloud-aiplatform google-cloud-aiplatform[evaluation]

# Welcome to Dr. Abhishek Cloud Tutorials. Like the video and subscribe to the channel: https://www.youtube.com/@drabhishek.5460/videos

In [None]:
%pip install --upgrade --quiet google-genai nest-asyncio==1.5.9



import pandas as pd
from inspect import cleandoc
from IPython.display import display, Markdown

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
    PairwiseMetric,
    PairwiseMetricPromptTemplate,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
)

# Show full cell contents when DataFrames are printed (no column-width truncation).
pd.options.display.max_colwidth = None



import nest_asyncio
import pandas as pd
from inspect import cleandoc
from IPython.display import display, Markdown
import vertexai
from vertexai.preview.evaluation import EvalTask
from vertexai.generative_models import GenerativeModel, GenerationConfig

# Per the nest_asyncio documentation, apply() patches asyncio so that an
# already-running event loop can be re-entered.
# NOTE(review): presumably required because the evaluation SDK runs async code
# inside the notebook's own event loop — confirm.
nest_asyncio.apply()



# Get the current Google Cloud project ID
PROJECT_ID = !gcloud config get-value project
PROJECT_ID = PROJECT_ID[0].strip()  # Added strip() to remove any whitespace

# Set the location/region
LOCATION = "us-central1"

# Initialize Vertex AI
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Welcome to Dr. Abhishek Cloud Tutorials. Like the video and subscribe to the channel: https://www.youtube.com/@drabhishek.5460/videos

# Hourly pay rate (in dollars) for each crew role. Later joined with
# `planning_notes` to form the context given to the models.
hourly_rates = cleandoc("""
Screenwriter: $40
Actor: $25
Director: $30
Camera Operator: $35
Sound Engineer: $20
Editor: $30
""")

# Free-text production plan: for each phase, who works and for how long
# (stated either in hours or in days of filming).
planning_notes = cleandoc("""
Phases of Production:
  Writing:
  The Screenwriter will write the script.
  They need 72 hours to do so.

  Pre-Production:
  The Director needs time to analyze the script.
  They will work on it for 36 hours.
  The Camera Operator will join the director for 24 hours of planning.

  Production Phase 1:
  The first three days of filming will require the director, 4 actors, the camera operator, and the sound engineer.

  Production Phase 2:
  The next three days of filming will require the director, 8 actors, the camera operator, and the sound engineer.

  Post-Production:
  The editor will take 64 hours to edit the film.
  The director will work with the editor for 24 hours during this phase.
""")

# Welcome to Dr. Abhishek Cloud Tutorials. Like the video and subscribe to the channel: https://www.youtube.com/@drabhishek.5460/videos


# The three planning questions posed to each model. The embedded "\n    "
# sequences reproduce the line breaks and indentation of the original
# multi-line literals, so the prompt text is unchanged.
tasks = [
    # 1. Cost per phase
    (
        "What is the cost of each phase of production?\n"
        "    If days are mentioned, assume an 8 hour work day."
    ),
    # 2. Duration per phase and overall
    (
        "How many days will each phase require? Assume an\n"
        "    8 hour work day. If multiple people are working in parallel,\n"
        "    do not add those times together, but only use the longest time.\n"
        "    Also include a count of the total number of days of the entire\n"
        "    project."
    ),
    # 3. Calendar schedule
    (
        "Prepare a text schedule for all phases of the film starting\n"
        "    on Feb 3, 2025. The whole crew should be off Saturdays\n"
        "    and Sundays."
    ),
]

# Welcome to Dr. Abhishek Cloud Tutorials. Like the video and subscribe to the channel: https://www.youtube.com/@drabhishek.5460/videos


# Document-style prompt with `{task}` and `{context}` placeholders, filled in
# with str.format() before being sent to a model.
prompt_template = "\n".join(
    [
        "<instructions>",
        "Prepare a document to fulfill the task based on the context provided.",
        "</instructions>",
        "<task>",
        "{task}",
        "</task>",
        "<context>",
        "{context}",
        "</context>",
    ]
)




# Both models share one generation config with temperature 0.
config = GenerationConfig(temperature=0)

# Instantiate the two Gemini models being compared, in (pro, flash) order.
llm_pro, llm_flash = (
    GenerativeModel(model_name, generation_config=config)
    for model_name in ("gemini-2.5-pro", "gemini-2.0-flash-001")
)



# Single context string: the hourly rates followed by the planning notes.
context = "\n".join((hourly_rates, planning_notes))



# Ask both models the second task (phase durations) and render each answer as
# Markdown: llm_pro first, then llm_flash.
for preview_model in (llm_pro, llm_flash):
    preview_prompt = prompt_template.format(task=tasks[1], context=context)
    preview_response = preview_model.generate_content(preview_prompt)
    display(Markdown(preview_response.text))



# Build the evaluation dataset: one row per task, each paired with the same
# shared context string.
eval_df = pd.DataFrame(
    [{"instruction": task, "context": context} for task in tasks],
    columns=["instruction", "context"],
)

print("Evaluation DataFrame created successfully:")
print(eval_df.head())
print(f"DataFrame shape: {eval_df.shape}")





# Built-in pairwise question-answering-quality metric template from the SDK.
qa_quality_metric = MetricPromptTemplateExamples.Pairwise.QUESTION_ANSWERING_QUALITY

# Evaluation task over the instruction/context dataset, logged under the
# "indie-film-planning" experiment.
qa_eval_task = EvalTask(
    experiment="indie-film-planning",
    metrics=[qa_quality_metric],
    dataset=eval_df,
)

print("EvalTask created successfully!")




# Prompt used by EvalTask.evaluate() to assemble one prompt per dataset row;
# its placeholders are the dataset column names (`context`, `instruction`).
# It has its own name so it does not overwrite the document-style
# `prompt_template` (which uses `{task}`); re-running the earlier generation
# cell after this one would otherwise fail on the missing `task` placeholder.
qa_prompt_template = (
    "Answer the question based on the context.\n"
    "Context: {context}\n"
    "Question: {instruction}"
)

# Evaluation results keyed by a short model label. The dict is filled
# incrementally so a failure on the second model keeps the first result.
evaluations = {}
for model_label, model in (("pro", llm_pro), ("flash", llm_flash)):
    evaluations[model_label] = qa_eval_task.evaluate(
        model=model,
        prompt_template=qa_prompt_template,
    )




# Render, for each model in turn (pro, then flash), the response recorded in
# the metrics table for the row labelled 1 (the second task).
for shown_label in ("pro", "flash"):
    shown_table = evaluations[shown_label].metrics_table
    display(Markdown(shown_table.response[1]))






# Task 3: Prepare the Evaluation Dataset and EvalTask

# First, generate responses from both models for all tasks.
# responses_pro[k] and responses_flash[k] must always refer to tasks[k].
responses_pro = []
responses_flash = []

print("Generating responses from both models...")

# Document-style template with {task}/{context} placeholders, kept under its
# own name so this cell does not depend on what `prompt_template` currently holds.
corrected_prompt_template = cleandoc("""
<instructions>
Prepare a document to fulfill the task based on the context provided.
</instructions>
<task>
{task}
</task>
<context>
{context}
</context>
""")

failed_task_numbers = []

for task_number, task in enumerate(tasks, start=1):
    print(f"Generating response for task {task_number}...")
    prompt = corrected_prompt_template.format(task=task, context=context)

    try:
        response_pro = llm_pro.generate_content(prompt).text
        response_flash = llm_flash.generate_content(prompt).text
    except Exception as e:
        print(f"Error generating response for task {task_number}: {e}")
        # Fall back to empty responses for BOTH models so the pair stays comparable.
        response_pro, response_flash = "", ""
        failed_task_numbers.append(task_number)
    else:
        print(f"✓ Task {task_number} completed")

    # Append exactly once per task, after the try block, so the two lists
    # cannot drift out of step when only one of the two calls fails.
    responses_pro.append(response_pro)
    responses_flash.append(response_flash)

if failed_task_numbers:
    print(f"Responses generated with errors for task(s): {failed_task_numbers}")
else:
    print("Responses generated successfully!")

# Assemble the pairwise evaluation dataset: one record per task holding the
# instruction, the Pro response as baseline, the Flash response as candidate,
# and the shared context.
eval_data = [
    {
        'instruction': task,
        'baseline_model_response': pro_response,
        'candidate_model_response': flash_response,
        'context': context,
    }
    for task, pro_response, flash_response in zip(tasks, responses_pro, responses_flash)
]

eval_df = pd.DataFrame(eval_data)
print("Evaluation DataFrame created:")
print(eval_df.head())

# Create the EvalTask with the pairwise question-answering-quality metric.
# The metric comes from the SDK's MetricPromptTemplateExamples constant, the
# same way the earlier `qa_eval_task` is built, rather than from a hand-typed
# metric string.
# NOTE(review): per the Vertex AI SDK reference, EvalTask requires `metrics`
# and PairwiseMetric requires a `metric` name, so a chain of fallbacks that
# omit them cannot succeed; a construction error is surfaced directly here.
# Confirm against the installed SDK version.
pairwise_qa_metric = MetricPromptTemplateExamples.Pairwise.QUESTION_ANSWERING_QUALITY

eval_task = EvalTask(
    dataset=eval_df,  # needs baseline_model_response / candidate_model_response columns
    metrics=[pairwise_qa_metric],
    experiment="indie-film-planning",
)

print("EvalTask created successfully!")

  from google.cloud.aiplatform.utils import gcs_utils


Based on the context provided, here is the breakdown of the days required for each phase of the project, assuming an 8-hour workday.

### **Phase Durations**

*   **Writing:** 9 Days
    *   The Screenwriter requires 72 hours.
    *   (72 hours / 8 hours per day = 9 days)

*   **Pre-Production:** 4.5 Days
    *   The Director works for 36 hours, and the Camera Operator works for 24 hours in parallel. The longest duration is used.
    *   (36 hours / 8 hours per day = 4.5 days)

*   **Production Phase 1:** 3 Days
    *   The context explicitly states this phase will take three days.

*   **Production Phase 2:** 3 Days
    *   The context explicitly states this phase will take three days.

*   **Post-Production:** 8 Days
    *   The Editor works for 64 hours, and the Director works for 24 hours in parallel. The longest duration is used.
    *   (64 hours / 8 hours per day = 8 days)

---

### **Total Project Duration**

The total number of days for the entire project is **27.5 days**.

(9 + 4.5 + 3 + 3 + 8 = 27.5)

## Project Timeline Breakdown (Based on 8-Hour Workdays)

Here's a breakdown of the project timeline, calculating the number of days required for each phase and the total project duration:

**Phase Breakdown:**

*   **Writing:**
    *   Screenwriter: 72 hours
    *   Days: 72 hours / 8 hours/day = **9 days**

*   **Pre-Production:**
    *   Director: 36 hours
    *   Camera Operator: 24 hours
    *   Since the Director and Camera Operator are working in parallel, we take the longest duration.
    *   Days: 36 hours / 8 hours/day = **4.5 days**

*   **Production Phase 1:**
    *   Duration: 3 days
    *   Days: **3 days**

*   **Production Phase 2:**
    *   Duration: 3 days
    *   Days: **3 days**

*   **Post-Production:**
    *   Editor: 64 hours
    *   Director: 24 hours
    *   Since the Editor and Director are working in parallel, we take the longest duration.
    *   Days: 64 hours / 8 hours/day = **8 days**

**Total Project Duration:**

To calculate the total project duration, we sum the days required for each phase:

9 days (Writing) + 4.5 days (Pre-Production) + 3 days (Production Phase 1) + 3 days (Production Phase 2) + 8 days (Post-Production) = **27.5 days**

**Summary:**

*   **Writing:** 9 days
*   **Pre-Production:** 4.5 days
*   **Production Phase 1:** 3 days
*   **Production Phase 2:** 3 days
*   **Post-Production:** 8 days
*   **Total Project Duration:** 27.5 days


Evaluation DataFrame created successfully:
                                                                                                                                                                                                                                                                 instruction  \
0                                                                                                                                                                       What is the cost of each phase of production?\n    If days are mentioned, assume an 8 hour work day.   
1  How many days will each phase require? Assume an\n    8 hour work day. If multiple people are working in parallel,\n    do not add those times together, but only use the longest time.\n    Also include a count of the total number of days of the entire\n    project.   
2                                                                                                                                  Prepare a 

INFO:vertexai.preview.evaluation.eval_task:Logging Eval experiment evaluation metadata: {'prompt_template': 'Answer the question based on the context.\nContext: {context}\nQuestion: {instruction}', 'model_name': 'publishers/google/models/gemini-2.5-pro'}
INFO:vertexai.preview.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.preview.evaluation._pre_eval_utils:Generating a total of 3 responses from Gemini model gemini-2.5-pro.
100%|██████████| 3/3 [00:15<00:00,  5.25s/it]
INFO:vertexai.preview.evaluation._pre_eval_utils:All 3 responses are successfully generated from model.
INFO:vertexai.preview.evaluation._evaluation:Multithreaded Batch Inference took: 15.754659776000153 seconds.
INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 3 Vertex Gen AI Evaluation Service API requests.
  0%|          | 0/3 [00:00<?, ?i

INFO:vertexai.preview.evaluation.eval_task:Logging Eval experiment evaluation metadata: {'prompt_template': 'Answer the question based on the context.\nContext: {context}\nQuestion: {instruction}', 'model_name': 'publishers/google/models/gemini-2.0-flash-001'}
INFO:vertexai.preview.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.preview.evaluation._pre_eval_utils:Generating a total of 3 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 3/3 [00:03<00:00,  1.19s/it]
INFO:vertexai.preview.evaluation._pre_eval_utils:All 3 responses are successfully generated from model.
INFO:vertexai.preview.evaluation._evaluation:Multithreaded Batch Inference took: 3.5909150130000853 seconds.
INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 3 Vertex Gen AI Evaluation Service API requests.
  0%|          | 0/3 

Based on the context provided:

*   **Writing:** 9 days (72 hours / 8 hours per day)
*   **Pre-Production:** 4.5 days (The Director's 36 hours is the longest time / 8 hours per day)
*   **Production Phase 1:** 3 days
*   **Production Phase 2:** 3 days
*   **Post-Production:** 8 days (The Editor's 64 hours is the longest time / 8 hours per day)
*   **Total Project Days:** 27.5 days

Here's the breakdown of each phase in days, assuming an 8-hour workday:

*   **Writing:**
    *   Screenwriter: 72 hours / 8 hours/day = 9 days

*   **Pre-Production:**
    *   Director: 36 hours / 8 hours/day = 4.5 days
    *   Camera Operator: 24 hours / 8 hours/day = 3 days
    *   Longest time: 4.5 days

*   **Production Phase 1:**
    *   3 days

*   **Production Phase 2:**
    *   3 days

*   **Post-Production:**
    *   Editor: 64 hours / 8 hours/day = 8 days
    *   Director: 24 hours / 8 hours/day = 3 days
    *   Longest time: 8 days

**Total Project Time:**

9 + 4.5 + 3 + 3 + 8 = 27.5 days

Generating responses from both models...
Generating response for task 1...
✓ Task 1 completed
Generating response for task 2...
✓ Task 2 completed
Generating response for task 3...
✓ Task 3 completed
Responses generated successfully!
Evaluation DataFrame created:
                                                                                                                                                                                                                                                                 instruction  \
0                                                                                                                                                                       What is the cost of each phase of production?\n    If days are mentioned, assume an 8 hour work day.   
1  How many days will each phase require? Assume an\n    8 hour work day. If multiple people are working in parallel,\n    do not add those times together, but only use the longest time.\n    

In [None]:
# Task 4: Run the evaluations and examine results


def _results_frame(evaluation_result):
    """Return the per-row results DataFrame of an evaluation result, or None.

    `metrics_table` is checked first because that is the attribute this
    notebook already reads from evaluation results; `result_df` is kept as a
    secondary name for backward compatibility.
    """
    for attr_name in ("metrics_table", "result_df"):
        frame = getattr(evaluation_result, attr_name, None)
        if frame is not None:
            return frame
    return None


def _columns_containing(columns, keywords):
    """Return the columns whose lower-cased name contains any of `keywords`."""
    return [
        col for col in columns
        if any(keyword in str(col).lower() for keyword in keywords)
    ]


def _preview(text, limit=200):
    """Return `text`, cut to `limit` characters plus "..." when it is longer."""
    return text[:limit] + "..." if len(text) > limit else text


def _report_evaluation(evaluation_result):
    """Print the summary, results table, pairwise choices and explanations."""
    print("\n" + "=" * 60)
    print("EVALUATION RESULTS")
    print("=" * 60)

    # NOTE(review): `summary_metrics` is read defensively; it is expected to
    # hold aggregate values (e.g. win rates) — confirm against the SDK version.
    summary = getattr(evaluation_result, "summary_metrics", None)
    if summary:
        print("\nSummary metrics:")
        for metric_name, metric_value in summary.items():
            print(f"{metric_name}: {metric_value}")

    results = _results_frame(evaluation_result)
    if results is None:
        # Nothing tabular to show: list what the result object does expose.
        print("\nAvailable attributes in evaluation_result:")
        for attr in dir(evaluation_result):
            if not attr.startswith('_'):
                attr_value = getattr(evaluation_result, attr)
                if attr_value is not None and not callable(attr_value):
                    print(f"{attr}: {type(attr_value)}")

        print("\nRaw evaluation result structure:")
        print(evaluation_result)
        return

    print("\nMetrics Table:")
    display(results)

    result_columns = results.columns.tolist()
    print(f"\nAvailable columns: {result_columns}")

    # Pairwise metrics report the judged winner and the rationale in columns
    # whose names contain these keywords.
    choice_columns = _columns_containing(result_columns, ("choice", "preference"))
    explanation_columns = _columns_containing(result_columns, ("explanation", "reason"))

    if choice_columns:
        print("\n=== PAIRWISE CHOICES ===")
        for choice_col in choice_columns:
            for task_number, choice in enumerate(results[choice_col], start=1):
                print(f"Task {task_number} ({choice_col}): {choice}")

    if explanation_columns:
        print("\n=== EXPLANATIONS ===")
        for exp_col in explanation_columns:
            for task_number, explanation in enumerate(results[exp_col], start=1):
                print(f"Task {task_number} Explanation ({exp_col}):")
                print(explanation)
                print("-" * 80)


def _print_manual_comparison(tasks, responses_pro, responses_flash):
    """Print a rough side-by-side of the two models' responses per task.

    This is only a fallback for when the automated evaluation fails: response
    length and the presence of digits are crude signals, not a quality score.
    """
    print("\n" + "=" * 60)
    print("MANUAL COMPARISON ANALYSIS")
    print("=" * 60)

    rows = zip(tasks, responses_pro, responses_flash)
    for task_number, (task, pro_text, flash_text) in enumerate(rows, start=1):
        print(f"\n{'=' * 50}")
        print(f"TASK {task_number}")
        print(f"{'=' * 50}")
        print(f"Instruction: {task[:100]}...")

        pro_len = len(pro_text)
        flash_len = len(flash_text)

        print(f"\nGemini Pro Response Length: {pro_len} characters")
        print(f"Gemini Flash Response Length: {flash_len} characters")

        if pro_len > flash_len:
            print("✓ Gemini Pro provided more detailed response")
        elif flash_len > pro_len:
            print("✓ Gemini Flash provided more detailed response")
        else:
            print("○ Both responses have similar length")

        pro_has_numbers = any(char.isdigit() for char in pro_text)
        flash_has_numbers = any(char.isdigit() for char in flash_text)

        if pro_has_numbers and not flash_has_numbers:
            print("✓ Gemini Pro included numerical data")
        elif flash_has_numbers and not pro_has_numbers:
            print("✓ Gemini Flash included numerical data")

        print("\nSample of Gemini Pro response:")
        print(_preview(pro_text))

        print("\nSample of Gemini Flash response:")
        print(_preview(flash_text))


print("Running evaluations... This may take 2-3 minutes.")

# Only the evaluate() call is guarded, so a bug in the reporting code is not
# misreported as an evaluation failure.
try:
    evaluation_result = eval_task.evaluate()
except Exception as e:
    print(f"Error during evaluation: {e}")
    print("\nTrying alternative evaluation approach...")
    _print_manual_comparison(tasks, responses_pro, responses_flash)
else:
    print("Evaluation completed successfully!")
    _report_evaluation(evaluation_result)

# Closing banner and guidance on how to read the results printed above.
banner = "=" * 60
closing_lines = (
    "\n" + banner,
    "EVALUATION COMPLETED",
    banner,
    "Check the results above to determine which model performs better.",
    "Look for metrics like 'pairwise_choice' or similar columns that indicate preference.",
    "If automated evaluation failed, use the manual comparison to make your judgment.",
)
for closing_line in closing_lines:
    print(closing_line)