### Install dependencies

In [17]:
from google.adk.tools import ToolContext
%pip install --upgrade --quite 'google-adk'
%pip install --upgrade --quite 'google-cloud-aiplatform[evaluation]'

%pip install python-dotenv
%pip install plotly


Usage:   
  /Users/lingyimu/Projects/mock-interview-bot/.venv/bin/python -m pip install [options] <requirement specifier> [package-index-options] ...
  /Users/lingyimu/Projects/mock-interview-bot/.venv/bin/python -m pip install [options] -r <requirements file> [package-index-options] ...
  /Users/lingyimu/Projects/mock-interview-bot/.venv/bin/python -m pip install [options] [-e] <vcs project url> ...
  /Users/lingyimu/Projects/mock-interview-bot/.venv/bin/python -m pip install [options] [-e] <local project path> ...
  /Users/lingyimu/Projects/mock-interview-bot/.venv/bin/python -m pip install [options] <archive url/path> ...

no such option: --quite
Note: you may need to restart the kernel to use updated packages.

Usage:   
  /Users/lingyimu/Projects/mock-interview-bot/.venv/bin/python -m pip install [options] <requirement specifier> [package-index-options] ...
  /Users/lingyimu/Projects/mock-interview-bot/.venv/bin/python -m pip install [options] -r <requirements file> [

### Google Cloud project setup

In [18]:
import os
from dotenv import load_dotenv
import vertexai

# Load settings from the repository-level .env file into the process environment.
load_dotenv(dotenv_path="../.env")
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION")

# Build the bucket URI only when a bucket name is configured. Formatting a missing
# value would give the truthy string "gs://None" and defeat the check below.
# (Reusing double quotes inside an f-string is also a SyntaxError before Python 3.12,
# so the lookup is done outside the f-string.)
_bucket_name = os.environ.get("BUCKET_NAME")
BUCKET_URI = f"gs://{_bucket_name}" if _bucket_name else None

# Force this flag on regardless of what .env contains.
# NOTE(review): presumably this routes google-genai/ADK calls through Vertex AI — confirm.
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"

EXPERIMENT_NAME = "evaluate-mock-interview-agent"

# Validate the configuration before initialising the SDK so that a missing
# setting fails here with a clear message.
assert PROJECT_ID, "GOOGLE_CLOUD_PROJECT is empty"
assert LOCATION, "GOOGLE_CLOUD_LOCATION is not set"
assert BUCKET_URI, "BUCKET_NAME is not set"

vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)

### Imports

In [19]:
import json

# General
import random
import string
from typing import Any

from IPython.display import HTML, Markdown, display
from google.adk.agents import Agent

# Build agent with adk
from google.adk.events import Event
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService
from google.adk.tools import ToolContext, transfer_to_agent

# Evaluate agent
from google.cloud import aiplatform
from google.genai import types
import pandas as pd
import plotly.graph_objects as go
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation.metrics import (
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
    TrajectorySingleToolUse,
)

### Helper functions

In [20]:
def get_id(length: int = 8) -> str:
    """Return a random identifier made of ``length`` characters (default 8).

    Characters are drawn, with replacement, from lowercase ASCII letters and digits.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


def parse_adk_output_to_dictionary(events: list[Event]) -> dict[str, Any]:
    """Condense a list of ADK events into a response/trajectory dictionary.

    Args:
        events: Events produced by a runner invocation. Events without content
            or without parts are skipped.

    Returns:
        A dict with two keys:
        ``"response"`` - the stripped text of the last text part found in an
        event whose content role is ``"model"`` (empty string if none);
        ``"predicted_trajectory"`` - a JSON string encoding a list of
        ``{"tool_name": ..., "tool_input": {...}}`` entries, one per distinct
        function call, in order of first appearance.
    """
    final_response = ""
    predicted_trajectory_list = []

    for event in events:
        content = event.content
        # Nothing to inspect when the event carries no content or no parts.
        if not content or not content.parts:
            continue

        for part in content.parts:
            call = part.function_call
            if call:
                tool_info = {
                    "tool_name": call.name,
                    # A call without arguments may carry `args=None`;
                    # `dict(None)` would raise TypeError, so fall back to {}.
                    "tool_input": dict(call.args or {}),
                }
                # Identical (name, input) calls are recorded only once.
                if tool_info not in predicted_trajectory_list:
                    predicted_trajectory_list.append(tool_info)

            # Keep overwriting: the last model text part seen wins.
            if content.role == "model" and part.text:
                final_response = part.text.strip()

    return {
        "response": str(final_response),
        # Serialised so the trajectory can be stored as a plain string value.
        "predicted_trajectory": json.dumps(predicted_trajectory_list),
    }


def format_output_as_markdown(output: dict) -> str:
    """Render a parsed agent output dictionary as a markdown string.

    Args:
        output: Dict with a ``"response"`` entry and an optional
            ``"predicted_trajectory"`` entry. The trajectory may be a JSON
            string (as produced by ``parse_adk_output_to_dictionary``) or an
            already-decoded list of ``{"tool_name", "tool_input"}`` dicts.

    Returns:
        Markdown containing the response and, when at least one call was
        recorded, a "Function Calls" section listing each call's arguments.

    The input dictionary is not modified, so the function can be called
    repeatedly on the same object.
    """
    pieces = ["### AI Response\n", f"{output['response']}\n\n"]

    trajectory = output.get("predicted_trajectory")
    if isinstance(trajectory, str):
        # Decode into a local; writing the list back into `output` would make
        # a second call fail on json.loads(list).
        trajectory = json.loads(trajectory) if trajectory else []

    # Check the decoded list: the JSON text "[]" is a truthy string, which
    # would otherwise emit a header with nothing underneath it.
    if trajectory:
        pieces.append("### Function Calls\n")
        for call in trajectory:
            pieces.append(f"- **Function**: `{call['tool_name']}`\n")
            pieces.append("  - **Arguments**:\n")
            for key, value in call["tool_input"].items():
                pieces.append(f"    - `{key}`: `{value}`\n")

    return "".join(pieces)


def display_eval_report(eval_result: pd.DataFrame) -> None:
    """Show an evaluation result as two titled tables.

    Reads ``eval_result.summary_metrics`` (rendered as a one-row table with one
    column per metric) and ``eval_result.metrics_table`` (rendered as-is).
    """
    summary_table = pd.DataFrame.from_dict(
        eval_result.summary_metrics, orient="index"
    ).transpose()

    display(Markdown("### Summary Metrics"))
    display(summary_table)

    display(Markdown("### Row-wise Metrics"))
    display(eval_result.metrics_table)


def display_drilldown(row: pd.Series) -> None:
    """Show predicted vs. reference tool calls for one row, step by step.

    Does nothing unless both ``row["predicted_trajectory"]`` and
    ``row["reference_trajectory"]`` are lists. Steps are paired positionally;
    for each pair the two tool names are rendered as HTML and, when both steps
    carry a dict ``tool_input``, every predicted argument is printed next to the
    reference value for the same key ("N/A" when the reference lacks it).
    """
    box_css = "white-space: pre-wrap; width: 800px; overflow-x: auto;"

    if not isinstance(row["predicted_trajectory"], list) or not isinstance(
        row["reference_trajectory"], list
    ):
        return

    step_pairs = zip(row["predicted_trajectory"], row["reference_trajectory"])
    for predicted_step, reference_step in step_pairs:
        tool_names = (predicted_step["tool_name"], reference_step["tool_name"])
        display(
            HTML(f"<h3>Tool Names:</h3><div style='{box_css}'>{tool_names}</div>")
        )

        # Argument comparison only makes sense when both sides are dicts.
        if not isinstance(predicted_step.get("tool_input"), dict) or not isinstance(
            reference_step.get("tool_input"), dict
        ):
            continue

        predicted_input = predicted_step["tool_input"]
        reference_input = reference_step["tool_input"]
        for arg_name in predicted_input:
            print("Tool Input Key: ", arg_name)
            reference_value = (
                reference_input[arg_name] if arg_name in reference_input else "N/A"
            )
            print("Tool Values: ", predicted_input[arg_name], reference_value)
        print("\n")

    display(HTML("<hr>"))


def display_dataframe_rows(
    df: pd.DataFrame,
    columns: list[str] | None = None,
    num_rows: int = 3,
    display_drilldown: bool = False,
) -> None:
    """Displays a subset of rows from a DataFrame, optionally including a drill-down view."""

    if columns:
        df = df[columns]

    base_style = "font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;"
    header_style = base_style + "font-weight: bold;"

    for _, row in df.head(num_rows).iterrows():
        for column in df.columns:
            display(
                HTML(
                    f"<span style='{header_style}'>{column.replace('_', ' ').title()}: </span>"
                )
            )
            display(HTML(f"<span style='{base_style}'>{row[column]}</span><br>"))

        display(HTML("<hr>"))

        if (
            display_drilldown
            and "predicted_trajectory" in df.columns
            and "reference_trajectory" in df.columns
        ):
            display_drilldown(row)


def plot_bar_plot(
    eval_result: pd.DataFrame, title: str, metrics: list[str] = None
) -> None:
    fig = go.Figure()
    data = []

    summary_metrics = eval_result.summary_metrics
    if metrics:
        summary_metrics = {
            k: summary_metrics[k]
            for k, v in summary_metrics.items()
            if any(selected_metric in k for selected_metric in metrics)
        }

    data.append(
        go.Bar(
            x=list(summary_metrics.keys()),
            y=list(summary_metrics.values()),
            name=title,
        )
    )

    fig = go.Figure(data=data)

    # Change the bar mode
    fig.update_layout(barmode="group")
    fig.show()


def display_radar_plot(eval_results, title: str, metrics=None):
    """Plot the radar plot."""
    fig = go.Figure()
    summary_metrics = eval_results.summary_metrics
    if metrics:
        summary_metrics = {
            k: summary_metrics[k]
            for k, v in summary_metrics.items()
            if any(selected_metric in k for selected_metric in metrics)
        }

    min_val = min(summary_metrics.values())
    max_val = max(summary_metrics.values())

    fig.add_trace(
        go.Scatterpolar(
            r=list(summary_metrics.values()),
            theta=list(summary_metrics.keys()),
            fill="toself",
            name=title,
        )
    )
    fig.update_layout(
        title=title,
        polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),
        showlegend=True,
    )
    fig.show()

## Build the problem-generator agent

### Tools

In [21]:
def write_problem_to_session(problem_json: str, tool_context: ToolContext):
    """
    Use this tool after generating the problem JSON.
    Formats the problem into a dictionary and stores it in the session state
    Args:
        problem_json (str): The string representing the problem JSON.
    """
    # NOTE(review): the docstring above is kept verbatim on purpose. ADK presumably
    # exposes it to the model as the tool description, so rewording it would
    # change the prompt. Confirm before editing it.
    #
    # Returns {"success": True, "problem": <dict>} after storing every top-level
    # key of the problem under "temp:problem_<key>" in tool_context.state, or
    # {"success": False, "error": <message>} when the payload is unusable.

    # The payload is model-generated text, so it may not be valid JSON. Report
    # that as a structured result instead of raising out of the tool.
    try:
        problem = json.loads(problem_json)
    except (TypeError, ValueError) as exc:  # JSONDecodeError subclasses ValueError
        return {"success": False, "error": f"problem_json is not valid JSON: {exc}"}

    # Only a JSON object has key/value pairs to store.
    if not isinstance(problem, dict):
        return {
            "success": False,
            "error": f"problem_json must decode to an object, got {type(problem).__name__}",
        }

    key_stem = "temp:problem"
    for key, value in problem.items():
        tool_context.state[f"{key_stem}_{key}"] = value
    print("After update: ")
    print(tool_context.state)
    return {"success": True, "problem": problem}


### The agent

In [26]:
# Model name passed as `model=` to the Agent built in get_agent().
model = "gemini-2.0-flash"


def get_session(app_name, user_id, session_id):
    """Create a fresh in-memory session service and one session on it.

    Returns:
        A ``(session_service, session)`` tuple, where ``session`` is whatever
        ``create_session`` returns for the given identifiers.
    """
    service = InMemorySessionService()
    return service, service.create_session(
        app_name=app_name,
        user_id=user_id,
        session_id=session_id,
    )


def get_agent(query):
    """Build the ProblemGenerator agent with ``query`` embedded in its instruction.

    The agent uses the module-level ``model`` and is given
    ``write_problem_to_session`` as its only tool.
    """
    # Runtime prompt text: kept exactly as-is, including its indentation.
    prompt = f"""
        Analyze this user request: '{query}'. Generate a Python dictionary describing a LeetCode-style coding question. After successful dict generation, use tool write_problem_to_session.

        The dict should have these following keys:
        "description": Generate a LeetCode-style coding question at the user's specified difficulty level in the user-specified topic if possible. Do NOT include any assumptions. Do NOT include text that's obviously a hint for any solution.
        "assumptions": A list of assumptions that user can make about the problem.
        "valid_input": Describe what is considered valid input for this question.
        "examples": Give 1-3 simple examples of inputs and expected outputs.
        "constraints": Optionally specify ideal time and space constraints.
        "input_size": Describe sizes for any variables/inputs in the problem.
        """
    return Agent(
        name="ProblemGenerator",
        model=model,
        description="An agent that generates LeetCode-style coding questions.",
        instruction=prompt,
        tools=[write_problem_to_session],
    )


### Test agent

In [28]:
# Identifiers for this test run; the same values are used for the session and the runner.
app_name = "mock_interview_conductor"
user_id = "user_one"
session_id = "session_one"

# The request is used twice: embedded in the agent instruction (get_agent) and
# sent as the user message below.
query = "Difficulty level: easy, topic: two-pointer"

session_service, session = get_session(app_name, user_id, session_id)
agent = get_agent(query)

runner = Runner(
    agent=agent, app_name=app_name,
    session_service=session_service
)
content = types.Content(role="user", parts=[types.Part(text=query)])
# Materialise every event from the run so they can be parsed after it finishes.
events = list(
    runner.run(user_id=user_id, session_id=session_id, new_message=content)
)


# Reduce the events to the final response text plus the tool-call trajectory, then render it.
response = parse_adk_output_to_dictionary(events)
display(Markdown(format_output_as_markdown(response)))

### AI Response
```json
{
  "description": "Given a sorted array of integers, determine if there exist two numbers such that they add up to a specific target value. Return true if such a pair exists, and false otherwise.",
  "assumptions": [
    "The input array is sorted in ascending order.",
    "The input array contains only integers."
  ],
  "valid_input": "A sorted array of integers and an integer target value.",
  "examples": [
    {
      "input": {
        "array": [2, 7, 11, 15],
        "target": 9
      },
      "output": true
    },
    {
      "input": {
        "array": [2, 7, 11, 15],
        "target": 10
      },
      "output": true
    },
    {
      "input": {
        "array": [2, 7, 11, 15],
        "target": 5
      },
      "output": false
    }
  ],
  "constraints": "Time complexity: O(n), Space complexity: O(1)",
  "input_size": "The array can contain up to 10^4 elements. The target value can be any integer."
}
```

### Function Calls


## Evaluate agent
Expected behavior: the agent should call the `write_problem_to_session` tool on every run.