In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Building Multi-Agent PlanGEN Frameworks Using the Vertex AI API for NATURAL PLAN

| | |
|-|-|
|Author(s) | Mihir Parmar, Hamid Palangi |

### Overview of PlanGEN

[PlanGEN](https://www.arxiv.org/pdf/2502.16111) is a model-agnostic and easily scalable multi-agent framework with three key components: constraint, verification, and selection agents. Specifically, our approach proposes constraint-guided iterative verification to enhance the performance of inference-time algorithms: Best of N, Tree-of-Thought, and REBASE. In the PlanGEN framework, the selection agent optimizes algorithm choice based on instance complexity, ensuring better adaptability to complex planning problems. We combine our agents with the inference-time algorithms, yielding four frameworks: (1) PlanGEN (Best of N), (2) PlanGEN (ToT), (3) PlanGEN (REBASE), and (4) PlanGEN (Mixture of Algorithms). Our key finding is that constraint-guided iterative verification improves inference-time algorithms, and adaptive selection further boosts performance on complex planning and reasoning problems. In this notebook, we provide implementations of these four frameworks for the calendar scheduling and trip planning tasks from NATURAL PLAN.

### Objectives

In this tutorial, you will build PlanGEN frameworks using the Vertex AI API and Python. You will develop the four frameworks mentioned above using Gemini-1.5-Pro-002 and Gemini-2.0-Flash-exp as the underlying models. You will use example instances from the calendar scheduling and trip planning tasks of [NATURAL PLAN](https://arxiv.org/abs/2406.04520) for this tutorial.

- Install the Vertex AI SDK for Python
- Define a model for your framework
- Define necessary Python libraries and functions for PlanGEN frameworks
- Test your framework locally on two tasks from NATURAL PLAN

### Enable APIs and Services

This tutorial uses the following billable components of Google Cloud, which you'll need to enable for this tutorial:

- [Enable Vertex AI API](https://console.cloud.google.com/apis/api/aiplatform.googleapis.com/overview)

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Getting Started



### Install Vertex AI SDK

> Install the latest version of the Vertex AI SDK for Python as well as extra dependencies related to PlanGEN:

In [None]:
# Install/upgrade the Vertex AI SDK (with its langchain and reasoningengine extras)
# plus `retrying`, which the notebook later imports for its `retry` decorator.
%pip install --upgrade --quiet \
"google-cloud-aiplatform[langchain,reasoningengine]" \
retrying

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.7/153.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

### Restart current runtime

> To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.

In [None]:
# Restart kernel after installs so that your environment can access the new packages
import IPython

# Grab the IPython application instance that owns this kernel.
app = IPython.Application.instance()
# True requests a restart rather than a plain shutdown; as the last expression
# of the cell, the returned status dict is what the cell displays.
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

### Authenticate your notebook environment (Colab only)

> If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using Vertex AI Workbench.

In [None]:
import sys

# The presence of `google.colab` among the loaded modules is used as the signal
# that the notebook is running on Colab; elsewhere the auth step is skipped.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

> To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

> Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
import os

# Replace the placeholder with your own Google Cloud project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# If the placeholder was left untouched, fall back to the GOOGLE_CLOUD_PROJECT
# environment variable. When that is unset as well, PROJECT_ID becomes None.
# NOTE(review): the SDK is then expected to resolve the project from the
# ambient credentials -- confirm for your environment.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig, HarmCategory, SafetySetting
from retrying import retry

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Define Underlying Model for PlanGEN

> Please specify the underlying model for PlanGEN frameworks

In [None]:
# Identifier of the Gemini model that every PlanGEN framework below passes to call_gemini.
model_name = "gemini-1.5-pro-002"

### Function for Calling Defined Model

> This code defines a function that sends a prompt to a Gemini model and returns the text of its response. It retries the request if it fails, lets you control the randomness and format of the response, and disables the safety filters.

In [None]:
# Build one SafetySetting per harm category, each with the "BLOCK_NONE" threshold.
safety_settings = [
    SafetySetting(category=harm_category, threshold="BLOCK_NONE")
    for harm_category in (
        HarmCategory.HARM_CATEGORY_UNSPECIFIED,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
]

# Number of tries the retry decorator makes before giving up; also used to
# wrap the attempt counter kept on the function object.
_CALL_GEMINI_MAX_ATTEMPTS = 5


@retry(wait_exponential_multiplier=16000, wait_exponential_max=256000, stop_max_attempt_number=_CALL_GEMINI_MAX_ATTEMPTS)
def call_gemini(prompt, model_name, temperature=0, candidate_count=None, stop_sequences=None, response_type=None, response_schema=None):
    """Send `prompt` to the given Gemini model and return the response text.

    Args:
        prompt: Content passed to `GenerativeModel.generate_content`.
        model_name: Model identifier used to construct the `GenerativeModel`.
        temperature: Sampling temperature forwarded to `GenerationConfig`.
        candidate_count: Forwarded to `GenerationConfig`.
        stop_sequences: Forwarded to `GenerationConfig`.
        response_type: Forwarded to `GenerationConfig` as `response_mime_type`.
        response_schema: Forwarded to `GenerationConfig`.

    Returns:
        `response.text`, or `" "` when the response has no `text` attribute.

    Raises:
        Any exception raised while building the request or reading the
        response is logged and re-raised, which lets the `retry` decorator
        call this function again.
    """
    # The decorator re-invokes this function after a failure, so the running
    # attempt number cannot live in a local variable; it is stored on the
    # function object. (Not safe for concurrent callers.)
    attempts = getattr(call_gemini, 'retry_count', 0) + 1

    try:
        model = GenerativeModel(model_name)
        generation_config = GenerationConfig(temperature=temperature, candidate_count=candidate_count, stop_sequences=stop_sequences, response_mime_type=response_type, response_schema=response_schema)

        response = model.generate_content(prompt, generation_config=generation_config, safety_settings=safety_settings)

        # NOTE(review): `hasattr` only swallows AttributeError; if `text` is a
        # property that raises something else, that error reaches the handler
        # below and triggers a retry -- confirm this is intended.
        text = response.text if hasattr(response, 'text') else " "

    except Exception as e:
        # Record the failed attempt for the next retry; wrap to 0 once the
        # retry budget is exhausted so the following call starts at 1 again.
        call_gemini.retry_count = attempts % _CALL_GEMINI_MAX_ATTEMPTS
        print(f"Error occurred on attempt {attempts}: {e}")
        raise

    # Success: the next call is a fresh first attempt.
    call_gemini.retry_count = 0
    return text

# Implementation of PlanGEN (Best of N)

> We adapted the Best of N algorithm and modified it using our constraint and verification agents.  The framework generates N candidate plans (Plan 1, Plan 2, ..., Plan n), and each plan is assessed by a verification agent based on a set of constraints. Then, a corresponding reward (Reward 1, Reward 2, ..., Reward n) gets assigned by the verification agent. Finally, the plan with the maximum reward is chosen, guaranteeing an optimal solution that best satisfies the problem constraints.

**Import necessary libraries for PlanGEN (Best of N)**



In [None]:
from os.path import join
import json
from tqdm import tqdm
import pandas as pd

**Class for PlanGEN (Best of N)**

In [None]:
class SelfConsistencyEvaluator:
    """PlanGEN (Best of N): sample several plans, score each, keep the best.

    Each candidate plan is generated with `call_gemini` and then scored by a
    second `call_gemini` request that embeds the plan and the verification
    constraints in `evaluate_system_prompt`.
    """

    def __init__(self, verification_prompt=None, model_name=None):
        """Store the constraint text and model name used for every request.

        Args:
            verification_prompt: Constraint text inserted into the scoring prompt.
            model_name: Model identifier forwarded to `call_gemini`.
        """
        self.verification_prompt = verification_prompt
        self.model_name = model_name
        self.evaluate_system_prompt = """Provide a reward score between -100 and 100 for the plan quality, using very strict standards. Do not give a full score above 95. Make sure the reward score is an integer.

Input:
{input}
Generated plan to evaluate:
{response}

Consider below constraints while evaluating:
{verification_prompt}

Make sure to check all the constraints before giving the reward score.

Please provide a reward in the below format:
[step-by-step reasoning for the reward score]
Score: [Strictly provide the reward score as an integer between -100 and 100]"""

    @staticmethod
    def _parse_reward(reward_output):
        """Extract the integer after the last 'Score:' and clamp it to [-100, 100].

        The text after the last 'Score:' is first parsed as a bare integer. If
        that fails (e.g. "85/100", "**85**", "85."), the first integer found in
        that text is used instead. Returns -100 when no integer can be read.
        """
        import re  # local import: `re` is not imported by the notebook's import cells

        tail = str(reward_output).split('Score:')[-1]
        candidates = [tail.strip()]
        found = re.search(r'[-+]?\d+', tail)
        if found is not None:
            candidates.append(found.group())

        for text in candidates:
            try:
                return max(-100, min(100, int(text)))  # Clamp between -100 and 100
            except ValueError:
                continue
        return -100

    def calculate_reward(self, prompt, output):
        """Score one generated plan.

        Args:
            prompt: The original task prompt.
            output: The generated plan to evaluate.

        Returns:
            Tuple `(output, reward_output, reward)`: the plan, the raw scoring
            response, and the parsed integer reward.
        """
        # Get reward score from Gemini
        reward_output = call_gemini(
            self.evaluate_system_prompt.format(
                input=prompt,
                response=output,
                verification_prompt=self.verification_prompt
            ),
            model_name=self.model_name
        )

        reward = self._parse_reward(reward_output)
        return (output, reward_output, reward)

    def get_best_response(self, prompt, no_of_times=5):
        """Generate `no_of_times` plans and return them ranked by reward.

        Args:
            prompt: The task prompt sent to the model for every sample.
            no_of_times: Number of candidate plans to generate.

        Returns:
            Tuple `(sorted_responses, best_response)` where `sorted_responses`
            is the list of `(output, reward_output, reward)` tuples in
            descending reward order and `best_response` is the top plan text.
            Returns `([], None)` when no candidates were generated.
        """
        lst_of_resp = []

        for _ in range(no_of_times):
            output = call_gemini(prompt, temperature=0.7, model_name=self.model_name)
            lst_of_resp.append(self.calculate_reward(prompt, output))

        # Nothing was sampled (no_of_times <= 0): there is no best plan to pick.
        if not lst_of_resp:
            return [], None

        # Sort responses by reward score and get the best one
        sorted_responses = sorted(lst_of_resp, key=lambda x: x[2], reverse=True)
        best_response = sorted_responses[0][0]

        return sorted_responses, best_response

# Implementation of PlanGEN (Tree-of-Thought)

> Tree-of-Thought (ToT) algorithm has been studied in detail for solving many complex problems. We modified the ToT algorithm with our constraint and verification agents. The method begins by initializing a root node that represents the problem and generating multiple potential next steps, creating a tree-like structure. The generated steps are verified using a verification agent which assigns reward scores based on a set of constraints. The iterative process involves evaluating all possible steps at a given depth, selecting the most promising path based on reward scores, and expanding it further by generating new steps. This process continues until a valid solution is identified or a pre-defined limit on iterations is reached.

**Import necessary libraries for PlanGEN (Tree-of-Thought)**

In [None]:
from __future__ import annotations
from typing import Tuple
import random
import math
from collections import deque
from enum import Enum
from pydantic import BaseModel
import tqdm
import numpy as np

ROOT_UCT_SCORE = 10_000

**Important Prompts**

In [None]:
# Template asking the model for only the next plan step.
# Placeholders: {problem}, {par_str} (the steps produced so far).
STEP_PROMPT = """You are an expert assistant for generating step-by-step plan to solve a given question using specified tools. Given the problem and any intermediate steps, output only the next step in the plan. Ensure that the next action helps in moving toward the correct plan to solve the given question. Do not provide the full plan. Keep responses concise, focusing solely on the immediate next step that is most effective in progressing toward the correct plan.

<problem>
{problem}
</problem>

<intermediate_step>
{par_str}
</intermediate_step>"""

# Template asking the model to score the intermediate steps against the constraints.
# Placeholders: {problem}, {par_str}, {verification_prompt}. The reply is
# requested to end with a "Score: <integer>" line.
REWARD_PROMPT = """Provide a reward score between -100 and 100 for the intermediate plan steps quality, using very strict standards. Do not give a full score above 95. Ensure the reward reflects how well the intermediate plan steps contributes to progressing toward the correct solution.

<problem>
{problem}
</problem>

<intermediate_step>
{par_str}
</intermediate_step>

Consider below constraints while evaluating:
{verification_prompt}

Make sure to check all the constraints before giving the reward score.

Please provide a reward in the below format:
[step-by-step reasoning for the reward score]
Score: [Strictly provide the reward score as an integer between -100 and 100]"""

# Template asking the model whether the steps so far already form a full plan.
# Placeholders: {problem}, {par_str}. The reply is requested to be '1' or '0'.
COMPLETE_PROMPT = """You are an assistant tasked with verifying if the final, complete plan to solve the given question has been achieved within the intermediate steps. Output only '1' if the intermediate steps contain the full solution needed to solve the question. If the full plan has not yet been reached, output only '0'. Provide no additional commentary—return exclusively '1' or '0'.

<problem>
{problem}
</problem>

<intermediate_step>
{par_str}
</intermediate_step>"""

**Classes for PlanGEN (ToT)**

In [None]:
class Node(BaseModel):
    """One vertex of the Tree-of-Thought search tree.

    Fields:
        step: Text of the step generated at this node.
        par_str: Step text accumulated along the path down to this node.
        parent: The node this one was expanded from, or None.
        children: Nodes expanded from this one.
        reward: Reward assigned to this node.
        complete: Completion marker for the steps up to this node.
    """
    step: str = ''
    par_str: str = ''
    parent: Node | None = None
    children: list[Node] = []
    reward: int = 0
    complete: bool = False

    def add_child(self, child_node: Node):
        """Attach a single node at the end of the child list."""
        self.children.insert(len(self.children), child_node)

    def add_children(self, child_nodes: list[Node]):
        """Attach every node in `child_nodes`, in order, at the end of the child list."""
        self.children[len(self.children):] = child_nodes

    def add_reward(self, reward: int):
        """Overwrite this node's reward with `reward`."""
        setattr(self, "reward", reward)


class BFS_sbs(BaseModel):
    """Step-by-step tree search used for PlanGEN (Tree-of-Thought).

    Starting from an empty root, the search samples `no_samples` candidate next
    steps, scores each against the verification constraints, checks each for
    completion, then expands the highest-reward child. It stops when a child is
    judged complete or the evaluation budget is used up.

    Fields:
        problem: Problem statement inserted into every prompt.
        verification_prompt: Constraint text inserted into the reward prompt.
        no_samples: Number of candidate steps sampled per expansion.
        model_name: Model identifier forwarded to `call_gemini`.
        is_hf: Not read anywhere in this class.
        root: Root node; replaced by `initialize()`.
    """
    problem: str
    verification_prompt: str
    no_samples: int = 5
    model_name: str = ""
    is_hf: bool = False
    root: Node = Node(step="Please insert the step here")

    class Config:
        arbitrary_types_allowed = True
        protected_namespaces = ()

    def _parse_reward(self, reward_output):
        """Extract the integer after the last 'Score:' and clamp it to [-100, 100].

        Falls back to the first integer found after 'Score:' when the text is
        not a bare integer (e.g. "85/100"). Returns -100 when nothing parses.
        """
        import re  # local import: `re` is not imported by the notebook's import cells

        tail = str(reward_output).split('Score:')[-1]
        candidates = [tail.strip()]
        found = re.search(r'[-+]?\d+', tail)
        if found is not None:
            candidates.append(found.group())

        for text in candidates:
            try:
                return max(-100, min(100, int(text)))  # Clamp between -100 and 100
            except ValueError:
                continue
        return -100

    def self_evaluate(self, node: Node):
        """Score `node` with the reward prompt and store the result on it."""
        reward = self._evaluate_(node)
        node.add_reward(reward)

    def select_node(self, node):
        """Return the child of `node` with the highest reward.

        Raises:
            ValueError: If `node` has no children.
        """
        return max(node.children, key=lambda candidate: candidate.reward)

    def initialize(self):
        """Reset the root and sample the first `no_samples` candidate steps."""
        self.root = Node(step="", par_str="")

        children = []
        for _ in range(self.no_samples):
            step_prompt = STEP_PROMPT.format(problem=self.problem, par_str="")
            step = call_gemini(step_prompt, model_name=self.model_name, temperature=0.7)
            children.append(Node(step=step, par_str=self.root.par_str + step, parent=self.root))

        self.root.add_children(children)

    def eval_children(self, node):
        """Assign a reward and a completion flag to every child of `node`."""
        for child in node.children:
            child.reward = self._evaluate_(child)
            # Store a real boolean: the reply counts as "complete" when it contains '1'.
            child.complete = '1' in str(self._complete_(child))

    def _evaluate_(self, node):
        """Ask the model to score the steps accumulated at `node`; return the parsed reward."""
        reward_prompt = REWARD_PROMPT.format(problem=self.problem, par_str=node.par_str, verification_prompt=self.verification_prompt)
        reward_output = call_gemini(reward_prompt, model_name=self.model_name)
        return self._parse_reward(reward_output)

    def _complete_(self, node):
        """Ask the model whether the steps at `node` form a full plan; return its raw reply."""
        complete_prompt = COMPLETE_PROMPT.format(problem=self.problem, par_str=node.par_str)
        complete = call_gemini(complete_prompt, model_name=self.model_name)
        return complete

    def expand_node(self, node):
        """Sample `no_samples` next steps from `node` and attach them as children."""
        children = []
        for _ in range(self.no_samples):
            step_prompt = STEP_PROMPT.format(problem=self.problem, par_str=node.par_str)
            step = call_gemini(step_prompt, model_name=self.model_name, temperature=0.7)
            children.append(Node(step=step, par_str=node.par_str + "\n" + step, parent=node))

        node.add_children(children)

    def print_tree(self, node=None, level=0):
        """Print the subtree under `node` (the root by default), indented by depth."""
        if node is None:
            node = self.root
        print("  " * level + f"Step: {node.step}, Reward: {node.reward}, Complete: {node.complete}")
        for child in node.children:
            self.print_tree(child, level + 1)

    def run(self, max_iterations: int = 20):
        """Run the search and return `(plan_text, status_message)`.

        Args:
            max_iterations: Budget counted in evaluated child nodes (not tree
                depths). The search stops at the level where the running count
                reaches this value.

        Returns:
            `(par_str, "")` for the first child judged complete, or
            `(par_str, "Solution reached iteration limit")` for the
            highest-reward child of the last evaluated level.

        Raises:
            ValueError: If `no_samples` is below 1, since no candidates could
                be generated.
        """
        if self.no_samples < 1:
            raise ValueError("no_samples must be at least 1 to generate candidate steps")

        self.initialize()
        current_node = self.root
        evaluated = 0
        while True:
            self.eval_children(current_node)

            # Check every child for completion before looking at the budget, so
            # a finished plan is never skipped because the limit was hit first.
            for child in current_node.children:
                if child.complete:
                    return child.par_str, ""

            evaluated += len(current_node.children)
            if evaluated >= max_iterations:
                best_child = max(current_node.children, key=lambda x: x.reward)
                return best_child.par_str, "Solution reached iteration limit"

            current_node = self.select_node(current_node)
            self.expand_node(current_node)

# Implementation of PlanGEN (REBASE)

> The REward BAlanced SEarch (REBASE) method inherits the exploitation and pruning properties of tree search and is well-studied for mathematical reasoning. The framework incorporates a dynamic selection and expansion strategy to iteratively refine solutions. At each depth of the tree, candidate nodes are ranked based on their assigned reward scores (obtained using a verification agent), ensuring that the most promising candidates are explored first. Even steps with lower rewards are considered but with a reducing number of children, meaning that their exploration depth is limited. This hierarchical pruning helps maintain efficiency, thereby reducing unnecessary exploration of weaker nodes. This process continues until either a valid, complete solution is found or a predefined depth or width limit is reached. Also, there is a completion check similar to ToT which identifies nodes that represent complete solutions, enabling REBASE to terminate early once a satisfactory outcome is identified.

**Important Prompts**

In [None]:
# NOTE(review): the three templates in this cell appear identical to the ones
# defined in the Tree-of-Thought section; running this cell rebinds the same names.

# Template asking the model for only the next plan step.
# Placeholders: {problem}, {par_str} (the steps produced so far).
STEP_PROMPT = """You are an expert assistant for generating step-by-step plan to solve a given question using specified tools. Given the problem and any intermediate steps, output only the next step in the plan. Ensure that the next action helps in moving toward the correct plan to solve the given question. Do not provide the full plan. Keep responses concise, focusing solely on the immediate next step that is most effective in progressing toward the correct plan.

<problem>
{problem}
</problem>

<intermediate_step>
{par_str}
</intermediate_step>"""

# Template asking the model to score the intermediate steps against the constraints.
# Placeholders: {problem}, {par_str}, {verification_prompt}.
REWARD_PROMPT = """Provide a reward score between -100 and 100 for the intermediate plan steps quality, using very strict standards. Do not give a full score above 95. Ensure the reward reflects how well the intermediate plan steps contributes to progressing toward the correct solution.

<problem>
{problem}
</problem>

<intermediate_step>
{par_str}
</intermediate_step>

Consider below constraints while evaluating:
{verification_prompt}

Make sure to check all the constraints before giving the reward score.

Please provide a reward in the below format:
[step-by-step reasoning for the reward score]
Score: [Strictly provide the reward score as an integer between -100 and 100]"""

# Template asking the model whether the steps so far already form a full plan.
# Placeholders: {problem}, {par_str}. The reply is requested to be '1' or '0'.
COMPLETE_PROMPT = """You are an assistant tasked with verifying if the final, complete plan to solve the given question has been achieved within the intermediate steps. Output only '1' if the intermediate steps contain the full solution needed to solve the question. If the full plan has not yet been reached, output only '0'. Provide no additional commentary—return exclusively '1' or '0'.

<problem>
{problem}
</problem>

<intermediate_step>
{par_str}
</intermediate_step>"""

**Classes for PlanGEN (REBASE)**

In [None]:
class TreeNode:
    """A single partial (or finished) solution in the REBASE search tree."""

    def __init__(self, id, question, value, score, num_step_tokens=0, parent=None):
        """Create a node and derive its depth and cumulative token count.

        Args:
            id: Identifier assigned to the node.
            question: The original problem statement.
            value: Solution text accumulated up to this node.
            score: Reward assigned to this node.
            num_step_tokens: Size recorded for this node's step.
            parent: Parent node; a falsy value marks this node as a root.
        """
        self.id, self.question = id, question
        self.value, self.score = value, score
        self.num_step_tokens = num_step_tokens
        self.parent = parent
        self.leaf = False

        if not parent:
            # Root: nothing to inherit.
            self.cum_tokens = num_step_tokens
            self.depth = 0
        else:
            # Child: extend the parent's running totals.
            self.cum_tokens = parent.cum_tokens + num_step_tokens
            self.depth = parent.depth + 1

    def check_leaf(self, tree):
        """Mark the node as a leaf when `tree._complete_(self)` contains '1'."""
        verdict = tree._complete_(self)
        self.leaf = '1' in verdict

    def is_leaf(self):
        """Return the leaf flag (True once the node was judged a complete solution)."""
        return self.leaf

    def get_score(self):
        """Return the reward stored on this node."""
        return self.score


class Tree:
    """Manages the tree structure for exploring solution paths"""
    def __init__(self, problem_statement, verification_prompt, model_name, paras):
        """
        Initialize solution tree with:
        problem_statement: the question to be solved
        verification_prompt: constraint text inserted into the reward prompt
        model_name: name of the language model to use
        paras: parameters controlling tree expansion; "width" and "temperature"
               are read by this class
        """
        self.nodes = []  # List of all nodes
        self.problem_statement = problem_statement
        self.verification_prompt = verification_prompt
        self.model_name = model_name
        self.paras = paras
        self.root = TreeNode(0, problem_statement, "", 1.0)  # Create root node
        self.nodes.append(self.root)
        self.depth_nodes = [[] for _ in range(100)]  # Organize nodes by depth (grown on demand in insert)
        self.depth_nodes[0].append(self.root)
        self.remaining_width = paras["width"]  # Total number of expansions still allowed
        self.running_list = []  # Temporary storage for new nodes
        self.history_list = []  # Track all generated nodes

    @staticmethod
    def _parse_reward(reward_output):
        """Extract the integer after the last 'Score:' and clamp it to [-100, 100].

        Falls back to the first integer found after 'Score:' when the text is
        not a bare integer (e.g. "85/100"). Returns -100 when nothing parses.
        """
        import re  # local import: `re` is not imported by the notebook's import cells

        tail = str(reward_output).split('Score:')[-1]
        candidates = [tail.strip()]
        found = re.search(r'[-+]?\d+', tail)
        if found is not None:
            candidates.append(found.group())

        for text in candidates:
            try:
                return max(-100, min(100, int(text)))  # Clamp between -100 and 100
            except ValueError:
                continue
        return -100

    def _complete_(self, node):
        """Determine if the node represents a complete solution using the COMPLETE_PROMPT."""
        complete_prompt = COMPLETE_PROMPT.format(problem=self.problem_statement, par_str=node.value)
        complete = call_gemini(complete_prompt, model_name=self.model_name)
        return complete

    def reset_running_list(self):
        """Clear the temporary node storage"""
        self.running_list = []

    def expand(self, node, width):
        """
        Generate 'width' number of new solution steps from given node
        using the language model and evaluate their quality.

        New nodes are appended to `running_list` and `history_list`; they are
        not added to `nodes` until `insert` is called for them.
        """
        for _ in range(width):
            step_prompt = STEP_PROMPT.format(problem=node.question, par_str=node.value)
            step = call_gemini(step_prompt, model_name=self.model_name, temperature=self.paras["temperature"])

            new_out = node.value + "\n" + step
            reward_prompt = REWARD_PROMPT.format(problem=node.question, par_str=new_out, verification_prompt=self.verification_prompt)
            reward_output = call_gemini(reward_prompt, self.model_name)
            score = self._parse_reward(reward_output)

            # NOTE(review): num_step_tokens receives the character length of the
            # whole accumulated text, not a token count of the step -- confirm intent.
            new_node = TreeNode(len(self.nodes), node.question, new_out, score, num_step_tokens=len(new_out), parent=node)
            new_node.check_leaf(self)
            # Remember that the completion query has already been made for this
            # text so insert() does not repeat it.
            new_node._leaf_checked = True
            self.running_list.append(new_node)
            self.history_list.append(new_node)

    def insert(self, state, parent):
        """Add a copy of `state` to the tree under `parent`, with a fresh node id."""
        new_node = TreeNode(len(self.nodes), state.question, state.value, state.score, state.num_step_tokens, parent)
        if getattr(state, "_leaf_checked", False):
            # Same text was already checked in expand(); reuse the verdict
            # instead of issuing a second, identical completion request.
            new_node.leaf = state.leaf
        else:
            new_node.check_leaf(self)
        new_node._leaf_checked = True
        self.nodes.append(new_node)
        # Grow the per-depth index when the tree gets deeper than the preallocated size.
        while new_node.depth >= len(self.depth_nodes):
            self.depth_nodes.append([])
        self.depth_nodes[new_node.depth].append(new_node)

    def select_and_expand(self, depth):
        """
        Select promising nodes at current depth and expand them.
        Returns False if no expansion is possible.
        """
        # A depth beyond the index simply has no nodes yet.
        nodes_at_depth = self.depth_nodes[depth] if depth < len(self.depth_nodes) else []
        candidate_nodes = [node for node in nodes_at_depth if not node.is_leaf()]

        if not candidate_nodes or self.remaining_width <= 0:
            return False

        # Highest-scoring nodes first, limited to the remaining expansion budget.
        sorted_nodes = sorted(candidate_nodes, key=lambda x: x.get_score(), reverse=True)
        selected_nodes = sorted_nodes[:self.remaining_width]

        for node in selected_nodes:
            self.expand(node, width=1)
            self.remaining_width -= 1
            if self.remaining_width <= 0:
                break
        return True

    def print_tree(self):
        """Print the entire tree structure"""
        def print_node(node, indent=""):
            print(f"{indent}Node ID: {node.id}, Score: {node.score}, Depth: {node.depth}, Leaf: {node.is_leaf()}")
            for child in [n for n in self.nodes if n.parent == node]:
                print_node(child, indent + "  ")

        print_node(self.root)


class Rebase:
    """Entry point for the reward-guided (REBASE) solution search."""

    def __init__(self, model_name, paras):
        """Keep the model name and the parameter mapping handed to each Tree."""
        self.model_name, self.paras = model_name, paras

    def reward_guided_search(self, problem_statement, verification_prompt):
        """
        Grow a Tree level by level, expanding nodes chosen by reward score.

        The loop ends when the tree reports that nothing can be expanded or
        when the depth reaches paras["max_depth"] (25 if absent). Returns the
        first node in the tree flagged as a complete solution, or None when
        there is none.
        """
        tree = Tree(problem_statement, verification_prompt, self.model_name, self.paras)
        depth = 0
        while True:
            tree.reset_running_list()
            expanded = tree.select_and_expand(depth)
            if not expanded:
                print("No more expansion possible, ending search")
                break

            # Move the freshly generated states into the tree proper.
            for state in tree.running_list:
                tree.insert(state, state.parent)

            depth += 1
            if depth >= self.paras.get("max_depth", 25):
                print("Reached maximum depth")
                break

        complete_solutions = list(filter(lambda candidate: candidate.is_leaf(), tree.nodes))
        print(f"\nFound {len(complete_solutions)} complete solutions")
        if complete_solutions:
            return complete_solutions[0]
        return None

# Implementation of PlanGEN (Mixture of Algorithms)

The Mixture of Algorithms framework introduces a selection agent that dynamically selects the most suitable inference-time algorithm from those proposed in the sections above, based on instance-level complexity. The framework operates in a modular and iterative manner, allowing it to adapt effectively to planning and reasoning problems of varying complexity.

> **Orchestration** The process begins with generating an initial plan using an LLM based on the task description and problem statement. Along with this, the constraint agent is employed to generate an instance-specific set of constraints. Based on the constraints, the verification agent evaluates the quality of the initial plan and provides a reward score. If the initial plan meets the required threshold, it is accepted as the "Final Plan". Otherwise, the iterative refinement process begins.

> **Iterative Refinement** During this iterative refinement, the selection agent determines the most suitable algorithm based on the instance-specific complexity and historical UCB values. The selected algorithm produces an updated plan, which is then re-evaluated by the verification agent. To ensure continual improvement, the framework incorporates feedback generated by a verification agent that provides guidance, and this feedback loop enables the system to refine the plan incrementally.

## Verification Agent

**Import libraries**

In [None]:
from typing import Dict, Any
from functools import lru_cache

**Class for Verification Agent**

In [None]:
class VerifierAgent:
    """Scores a plan against a set of constraints with a single model request."""

    def __init__(self, max_attempts: int = 3, base_temperature: float = 0.7):
        """Store configuration values.

        Args:
            max_attempts: Stored on the instance; not read by `verify`.
            base_temperature: Stored on the instance; not read by `verify`.
        """
        self.max_attempts = max_attempts
        self.base_temperature = base_temperature
        # Stored on the instance; not passed to the model by `verify`.
        self.response_schema = {
            "type": "object",
            "properties": {
                "feedback": {"type": "string"},
                "verified": {"type": "boolean"}
            },
            "required": ["feedback", "verified"]
        }

    @property
    def verifier_system_prompt(self) -> str:
        """Prompt template with {problem}, {par_str} and {verification_prompt} placeholders."""
        # A plain property is enough for a constant string; caching it per
        # instance would only keep every VerifierAgent alive in the cache.
        return """
        Provide a reward score between -100 and 100 for the plan steps quality, using very strict standards. Do not give a full score above 98. Ensure the reward reflects how well the plan steps contributes to progressing toward the correct solution.

<problem>
{problem}
</problem>

<plan>
{par_str}
</plan>

Consider below constraints while evaluating:
{verification_prompt}

Make sure to check all the constraints before giving the reward score.

Please provide a reward in the below format:
[step-by-step reasoning for the reward score]
Score: [Strictly provide the reward score as an integer between -100 and 100]
"""

    @staticmethod
    def _parse_reward(reward_output) -> int:
        """Extract the integer after the last 'Score:' and clamp it to [-100, 100].

        Falls back to the first integer found after 'Score:' when the text is
        not a bare integer (e.g. "85/100"). Returns -100 when nothing parses.
        """
        import re  # local import: `re` is not imported by the notebook's import cells

        tail = str(reward_output).split('Score:')[-1]
        candidates = [tail.strip()]
        found = re.search(r'[-+]?\d+', tail)
        if found is not None:
            candidates.append(found.group())

        for text in candidates:
            try:
                return max(-100, min(100, int(text)))  # Clamp between -100 and 100
            except ValueError:
                continue
        return -100

    def verify(self, problem_statement: str, plan: str, verification_prompt: str, model_name: str) -> Dict[str, Any]:
        """Score `plan` for `problem_statement` under the given constraints.

        Args:
            problem_statement: Problem text inserted into the prompt.
            plan: Plan text to evaluate.
            verification_prompt: Constraint text inserted into the prompt.
            model_name: Model identifier forwarded to `call_gemini`.

        Returns:
            Dict with "feedback" (the raw model reply), "score" (parsed integer
            reward) and "verified" (True when the score is at least 95).
        """
        prompt = self.verifier_system_prompt.format(problem=problem_statement, par_str=plan, verification_prompt=verification_prompt)
        reward_output = call_gemini(prompt, temperature=0, model_name=model_name)

        reward = self._parse_reward(reward_output)

        return {
            "feedback": reward_output,
            "score": reward,
            "verified": reward >= 95
        }

## Selection Agent

**Import libraries**

In [None]:
import sys
import math

**Class for Selection Agent**

In [None]:
class SelectionAgent:
    """Chooses an inference-time algorithm with a UCB-style bandit and an LLM prior.

    The three arms are ``"Best of N"``, ``"Rebase"`` and ``"ToT"``. Each arm's
    score combines its normalized average reward, an exploration term, an
    LLM-provided prior, a diversity bonus and a recovery score.

    Args:
        model_name: Optional model name stored in ``rebase_config``. When
            omitted, a module-level ``model_name`` variable is used if one
            exists, otherwise ``None`` (filled in later by ``rebase``).
    """

    def __init__(self, model_name=None):
        self.algorithms = {
            "Best of N": self.best_of_n,
            "Rebase": self.rebase,
            "ToT": self.tree_of_thoughts
        }
        if model_name is None:
            # Previously this read a bare global and raised NameError on a
            # fresh kernel; look it up defensively instead.
            model_name = globals().get("model_name")
        self.rebase_config = {
            "temperature": 0.7,
            "max_step_tokens": 256,
            "max_tokens": 1024,
            "select_method": "softmax",
            "num_threads": 8,
            "softmax_temperature": 0.2,
            "reward_model_type": "llemma",
            "policy_model_type": "llemma",
            "model_name": model_name,
            "store_path": "./exp_results/rebase_16/",
            "width": 10
        }

        # Multi-Armed Bandit initialization
        self.rewards = {algo: 0 for algo in self.algorithms.keys()}
        self.counts = {algo: 1 for algo in self.algorithms.keys()}  # Avoid division by zero
        self.total_trials = 0
        self.lambda_prior_base = 10  # Initial scaling for LLM prior
        self.alpha_diversity = 1  # Scaling factor for diversity bonus
        self.alpha_recovery = 1  # Scaling factor for recovery adjustments
        self.recovery_scores = {algo: 0 for algo in self.algorithms.keys()}  # Recovery performance
        self.failures = {algo: 0 for algo in self.algorithms.keys()}  # Failure counts
        self.diversity_bonus = {algo: 1 for algo in self.algorithms.keys()}  # Diversity scores

    def select_algorithm(self, problem_statement, feedback, previous_plan=None, model_name=None):
        """Select the algorithm with the highest revised UCB value.

        Args:
            problem_statement: Problem text shown to the LLM when asking for priors.
            feedback: Verifier feedback shown to the LLM as requirements.
            previous_plan: Optional earlier plan mentioned in the prior prompt.
            model_name: Forwarded to ``llm_guided_prior``.

        Returns:
            Tuple ``(callable, algorithm_name, (llm_prior, raw_llm_response), ucb_values)``.
        """
        # The prior's weight shrinks as more rewarded trials accumulate.
        lambda_prior = self.lambda_prior_base / (1 + self.total_trials)
        max_exploration = 5  # Cap for exploration term

        llm_prior, exp_llm = self.llm_guided_prior(problem_statement, feedback, previous_plan, model_name)

        # Normalize by the best cumulative reward; fall back to 1 to avoid
        # dividing by zero (or flipping signs) when nothing is positive yet.
        best_reward = max(self.rewards.values())
        max_reward = best_reward if best_reward > 0 else 1

        ucb_values = {}
        for algo in self.algorithms.keys():
            avg_reward = self.rewards[algo] / self.counts[algo]
            normalized_reward = avg_reward / max_reward
            exploration = min(math.sqrt(2 * math.log(self.total_trials + 1) / self.counts[algo]), max_exploration)

            # Diversity bonus (penalize overused algorithms)
            diversity_bonus = self.alpha_diversity / (self.counts[algo] + 1)

            # Recovery score adjustment, driven by register_failure()
            recovery_bonus = self.alpha_recovery * self.recovery_scores[algo]

            ucb_values[algo] = (
                normalized_reward
                + exploration
                + lambda_prior * llm_prior.get(algo, 0)
                + diversity_bonus
                + recovery_bonus
            )

        selected_algorithm = max(ucb_values, key=ucb_values.get)
        print(f"Selected Algorithm: {selected_algorithm} (UCB: {ucb_values[selected_algorithm]:.4f})")

        return self.algorithms[selected_algorithm], selected_algorithm, (llm_prior, exp_llm), ucb_values

    @staticmethod
    def _parse_prior_scores(response_full):
        """Parse the ``Scores:`` list of ``(name, score)`` pairs from an LLM reply.

        Only Python literals are accepted (``ast.literal_eval``), so text
        returned by the model can never execute code. Surrounding decoration
        such as markdown code fences is ignored by slicing from the first
        ``[`` to the last ``]`` after the marker. Scores are converted to
        float and clamped to [0, 1], the range requested in the prompt.

        Raises:
            ValueError, SyntaxError, TypeError or AttributeError when the
            reply does not contain a well-formed list.
        """
        _, marker, tail = response_full.partition("Scores:")
        if not marker:
            raise ValueError("no 'Scores:' section in LLM response")
        start, end = tail.find("["), tail.rfind("]")
        if start == -1 or end < start:
            raise ValueError("no score list found after 'Scores:'")
        pairs = ast.literal_eval(tail[start:end + 1])
        return {algo: min(1.0, max(0.0, float(score))) for algo, score in pairs}

    def llm_guided_prior(self, problem_statement, feedback, previous_plan=None, model_name="gemini-1.5-pro-002"):
        """Ask the LLM for a suitability score per algorithm.

        Returns:
            Tuple ``(scores, raw_response)``. ``scores`` maps algorithm name to
            a float in [0, 1]; if the reply cannot be parsed, every known
            algorithm gets 0 and the parse error is printed.
        """
        context = f" and improve upon the previous plan: '{previous_plan}'" if previous_plan else ""
        prompt = (
            f"Analyze the following planning problem and explain your reasoning for assigning priority scores to the algorithms "
            f"based on their suitability. Scores should be between 0 and 1, where 1 represents the most suitable algorithm for "
            f"the given problem:\n\n"
            f"Problem Statement: '{problem_statement}'\n"
            f"Requirements: {feedback}\n"
            f"Context: {context if context else 'None provided'}\n\n"
            "Start by providing a brief reasoning for each algorithm's suitability based on problem complexity. Then, ONLY output your response strictly as a list with the exact format below.\n"
            "Reasoning:\n"
            "- Best of N: [Explain why this algorithm is or isn’t suitable]\n"
            "- Rebase: [Explain why this algorithm is or isn’t suitable]\n"
            "- ToT: [Explain why this algorithm is or isn’t suitable]\n\n"
            "Scores:\n"
            '[\n'
            '    ("Best of N", float),  # Only the score between 0 and 1\n'
            '    ("Rebase", float),     # Only the score between 0 and 1\n'
            '    ("ToT", float)         # Only the score between 0 and 1\n'
            ']'
        )

        response_full = call_gemini(prompt, model_name=model_name)
        try:
            return self._parse_prior_scores(response_full), response_full
        except Exception as e:
            # Deliberate best-effort: a malformed reply must not abort selection.
            print(f"Error parsing LLM response: {e}")
            return {algo: 0 for algo in self.algorithms.keys()}, response_full  # Default to zero scores

    def register_failure(self, algorithm):
        """Record a failed run for ``algorithm`` and recompute its recovery score."""
        self.failures[algorithm] += 1
        # Recovery score is 1 / (failures + 1): it shrinks as failures pile up.
        self.recovery_scores[algorithm] = 1 / (self.failures[algorithm] + 1)

    def update_rewards(self, algorithm, reward):
        """Add ``reward`` to ``algorithm`` and bump its count and the global trial count."""
        self.rewards[algorithm] += reward
        self.counts[algorithm] += 1
        self.total_trials += 1

    def best_of_n(self, problem_statement, verification_prompt, model_name=None):
        """Run Best of N via ``SelfConsistencyEvaluator`` with five samples.

        Returns ``{"plan": <best answer or "">, "status": "refined"}``; errors
        from the evaluator are printed and yield an empty plan.
        """
        evaluator = SelfConsistencyEvaluator(verification_prompt= verification_prompt, model_name=model_name)

        try:
            _, best_answer = evaluator.get_best_response(problem_statement, no_of_times=5)
        except Exception as e:
            print(f"Error during Best of N run: {e}")
            best_answer = ""

        return {"plan": best_answer, "status": "refined"}

    def rebase(self, problem_statement, verification_prompt, model_name=None):
        """Run REBASE reward-guided search.

        Returns ``{"plan": <best answer or "">, "status": "refined"}``; errors
        from the solver are printed and yield an empty plan.
        """
        paras = self.rebase_config
        if paras.get("model_name") is None and model_name is not None:
            # Config was built without a model; use the one given for this call.
            paras = {**paras, "model_name": model_name}

        rebase_solver = Rebase(model_name=model_name, paras=paras)

        try:
            complete_solutions = rebase_solver.reward_guided_search(problem_statement, verification_prompt=verification_prompt)
            if complete_solutions:
                best_answer = complete_solutions.value if hasattr(complete_solutions, 'value') else str(complete_solutions)
            else:
                best_answer = ''
        except Exception as e:
            print(f"Error during Rebase run: {e}")
            best_answer = ''

        return {"plan": best_answer, "status": "refined"}

    def tree_of_thoughts(self, problem_statement, verification_prompt, model_name=None):
        """Run Tree-of-Thought search via ``BFS_sbs`` with three samples.

        Returns ``{"plan": <best answer or "">, "status": "refined"}``; errors
        from the solver are printed and yield an empty plan.
        """
        bfs_solver = BFS_sbs(
                problem=problem_statement,
                verification_prompt=verification_prompt,
                no_samples=3,
                model_name=model_name
            )

        try:
            best_answer, _ = bfs_solver.run()
        except Exception as e:
            print(f"Error during ToT run: {e}")
            best_answer = ""

        return {"plan": best_answer, "status": "refined"}

## Orchestration Unit

**Import libraries**

In [None]:
from os.path import join

**Classes for Orchestration Unit**

In [None]:
class Gemini:
    """Minimal wrapper that asks the model for a first-pass plan."""

    def create_initial_plan(self, problem_description, model_name):
        """Return the model's answer to ``problem_description`` at temperature 0.0.

        Args:
            problem_description: Prompt text sent to ``call_gemini`` unchanged.
            model_name: Forwarded to ``call_gemini``.
        """
        return call_gemini(
            problem_description,
            model_name=model_name,
            temperature=0.0,
        )

class OrchastrationUnit:
    """Drives the plan -> verify -> select algorithm -> refine loop.

    An initial plan is generated and scored, then up to three refinement
    rounds are run. Each round lets the selection agent pick an algorithm,
    runs it on a prompt enriched with the previous plan and feedback, and
    re-verifies the result.
    """

    def __init__(self):
        self.gemini = Gemini()
        self.verifier = VerifierAgent()
        self.selection_agent = SelectionAgent()

    @staticmethod
    def _plan_text(plan):
        """Return the plan text for prompts.

        The initial plan is whatever ``create_initial_plan`` returned, while
        a result dict carrying a ``"plan"`` entry is unwrapped to that entry.
        """
        if isinstance(plan, dict):
            return plan.get("plan", plan)
        return plan

    def execute(self, problem_description, verification_prompt=None, model_name=None):
        """Produce a plan for ``problem_description`` with iterative verification.

        Args:
            problem_description: Either a prompt string, or a list whose first
                item is the initial prompt and whose second item is the
                enhanced prompt used for algorithms other than Best of N.
            verification_prompt: Constraint text forwarded to the verifier and
                to the selected algorithm.
            model_name: Forwarded to every model-calling component.

        Returns:
            Tuple ``(plan, selected_algorithm_lst, {"priors", "ucb_values"},
            feedback_lst, all_plans)``. ``plan`` is the object produced by the
            winning step: the initial plan as returned by the model, or the
            selected algorithm's result. ``all_plans`` holds
            ``(plan, score)`` pairs in generation order.

        Raises:
            ValueError: If ``problem_description`` is neither a string nor a list.
        """
        print("\n=== Starting Orchestration Execution ===")

        if isinstance(problem_description, str):
            initial_prompt = enhanced_prompt = problem_description
        elif isinstance(problem_description, list):
            initial_prompt = problem_description[0]
            enhanced_prompt = problem_description[1]
        else:
            raise ValueError("problem_description must be either string or list")

        feedback_lst = []

        print("\n--- Step 1: Creating Initial Plan ---")
        plan = self.gemini.create_initial_plan(initial_prompt, model_name=model_name)

        print("\n--- Step 2: Getting Initial Verification ---")
        feedback = self.verifier.verify(initial_prompt, plan, verification_prompt, model_name=model_name)
        # Force at least one refinement round even if the first plan scored well.
        feedback["verified"] = False
        feedback_lst.append(feedback)

        iteration = 1
        max_iterations = 3
        selected_algorithm_lst, priors_lst, ucb_values_lst, all_plans = [], [], [], []

        all_plans.append((plan, feedback["score"]))

        while not feedback["verified"] and iteration <= max_iterations:
            print(f"\n--- Refinement Iteration {iteration} ---")

            # Prompts must carry the plan text, not the repr of a result dict.
            previous_plan_text = self._plan_text(plan)

            selected_algorithm, algorithm_name, priors, ucb_values = self.selection_agent.select_algorithm(
                initial_prompt,
                feedback["feedback"],
                previous_plan=previous_plan_text,
                model_name=model_name
            )

            if selected_algorithm:
                print(f"\n--- Step 3: Running {algorithm_name} Algorithm ---")
                selected_algorithm_lst.append(algorithm_name)
                priors_lst.append(priors)
                ucb_values_lst.append(ucb_values)

                # Best of N works from the initial prompt; Rebase and ToT use the enhanced one.
                if selected_algorithm.__name__ in ['mcts_with_refine', 'best_of_n']:
                    base_prompt = initial_prompt
                else:
                    base_prompt = enhanced_prompt

                enhanced_problem = f"Previous plan:\n{previous_plan_text}\nConsider this feedback while generating updated plan: {feedback['feedback']}\n\n{base_prompt}\n\nBased on the feedback, start generating the plan again from the begining following above prompt."
                plan = selected_algorithm(enhanced_problem, verification_prompt, model_name=model_name)
                feedback = self.verifier.verify(
                    enhanced_problem, self._plan_text(plan), verification_prompt, model_name=model_name
                )
                feedback_lst.append(feedback)

                # Update bandit rewards or register failure
                if not feedback["verified"]:
                    self.selection_agent.register_failure(algorithm_name)
                else:
                    self.selection_agent.update_rewards(algorithm_name, feedback["score"])

                all_plans.append((plan, feedback["score"]))

            iteration += 1

        # Fall back to the best-scoring plan only when no plan was verified.
        # Checking the iteration counter instead would also fire when the
        # plan is verified on the very last round.
        if not feedback["verified"]:
            plan = max(all_plans, key=lambda x: x[1])[0]
            selected_algorithm_lst.append("\n=== Maximum iterations reached without inference-time compute ===")

        return plan, selected_algorithm_lst, {"priors": priors_lst, "ucb_values": ucb_values_lst}, feedback_lst, all_plans

# Example Tutorial on Running All PlanGEN Frameworks on NATURAL PLAN

> This tutorial provides example runs on two tasks from the NATURAL PLAN benchmark: Calendar Scheduling and Trip Planning. For each task, only three instances were evaluated, focusing on the frameworks' ability to generate both a step-by-step plan (`model_plan`) and a final solution (`model_prediction`). All generated predictions were saved in a `final_prediction` list.

### Load examples for calendar scheduling

Data Template:
```
{
    'num_people': '<number of participants>',
    'num_days': '<number of days>',
    'duration': '<meeting duration in hours or minutes>',
    'prompt_5shot': "<detailed prompt with multiple examples to guide the scheduling system>",
    'golden_plan': "<expected correct scheduling outcome>",
    'pred_5shot_pro': "<model-generated scheduling prediction>",
    'prompt_0shot': "<shorter prompt with minimal or no examples, testing model’s ability to generalize>",
    'id': '<unique identifier for the scheduling task>'
}
```

In [None]:
data_calendar = [{'num_people': '3',
  'num_days': '1',
  'duration': '1',
  'prompt_5shot': "You are an expert at scheduling meetings. You are given a few constraints on the existing schedule of each participant, the meeting duration, and possibly some preferences on the meeting time. Note there exists a solution that works with existing schedule of every participant. Here are a few example tasks and solutions:\n\nTASK: You need to schedule a meeting for Roger, Karen and Dorothy for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nRoger's calendar is wide open the entire day.\nKaren has meetings on Monday during 10:00 to 10:30, 11:30 to 12:00, 12:30 to 13:00, 14:00 to 15:00, 15:30 to 16:00; \nDorothy is busy on Monday during 9:00 to 10:00, 10:30 to 11:00, 11:30 to 12:00, 12:30 to 13:00, 14:00 to 15:30, 16:00 to 17:00; \n\nYou would like to schedule the meeting at their earlist availability.\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 11:00 - 11:30 \n\nTASK: You need to schedule a meeting for Douglas, Lawrence and Isabella for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nDouglas has meetings on Monday during 12:00 to 12:30, 15:00 to 15:30; \nLawrence has meetings on Monday during 10:30 to 12:00, 13:00 to 13:30, 14:00 to 14:30, 15:30 to 16:00; \nIsabella is busy on Monday during 9:00 to 12:30, 13:30 to 17:00; \n\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 12:30 - 13:00 \n\nTASK: You need to schedule a meeting for Joshua, Denise and Jeremy for one hour between the work hours of 9:00 to 17:00 on Monday. 
\n\nHere are the existing schedules for everyone during the day: \nJoshua is busy on Monday during 10:00 to 10:30, 12:00 to 12:30, 14:00 to 14:30, 15:00 to 15:30; \nDenise's calendar is wide open the entire day.\nJeremy has meetings on Monday during 9:30 to 10:30, 12:00 to 13:00, 13:30 to 14:00, 14:30 to 15:00, 15:30 to 16:00, 16:30 to 17:00; \n\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 10:30 - 11:30 \n\nTASK: You need to schedule a meeting for Alan, Elizabeth and Denise for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nAlan has blocked their calendar on Monday during 13:00 to 13:30, 14:00 to 14:30; \nElizabeth is busy on Monday during 9:00 to 9:30, 11:00 to 11:30; \nDenise has blocked their calendar on Monday during 9:00 to 10:00, 10:30 to 11:30, 12:30 to 13:00, 13:30 to 14:00, 14:30 to 16:00, 16:30 to 17:00; \n\nYou would like to schedule the meeting at their earlist availability.\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 10:00 - 10:30 \n\nTASK: You need to schedule a meeting for Mason, Bruce and Christopher for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nMason has meetings on Monday during 9:30 to 10:00, 11:00 to 11:30, 14:30 to 15:00, 16:30 to 17:00; \nBruce is free the entire day.\nChristopher is busy on Monday during 9:30 to 10:30, 11:30 to 12:30, 15:00 to 17:00; \n\nMason would rather not meet on Monday before 12:30. Find a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 12:30 - 13:00 \n\nTASK: You need to schedule a meeting for Michelle, Steven and Jerry for one hour between the work hours of 9:00 to 17:00 on Monday. 
\n\nHere are the existing schedules for everyone during the day: \nMichelle has meetings on Monday during 11:00 to 12:00; \nSteven has blocked their calendar on Monday during 9:00 to 9:30, 11:30 to 12:00, 13:30 to 14:00, 15:30 to 16:00; \nJerry has blocked their calendar on Monday during 9:00 to 9:30, 10:00 to 11:00, 11:30 to 12:30, 13:00 to 14:30, 15:30 to 16:00, 16:30 to 17:00; \n\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: ",
  'golden_plan': 'Here is the proposed time: Monday, 14:30 - 15:30 ',
  'pred_5shot_pro': 'SOLUTION: Here is the proposed time: Monday, 14:30 - 15:30 \n',
  'prompt_0shot': "You are an expert at scheduling meetings. You are given a few constraints on the existing schedule of each participant, the meeting duration, and possibly some preferences on the meeting time. Note there exists a solution that works with existing schedule of every participant. Here are a few example tasks and solutions:\n\nTASK: You need to schedule a meeting for Michelle, Steven and Jerry for one hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nMichelle has meetings on Monday during 11:00 to 12:00; \nSteven has blocked their calendar on Monday during 9:00 to 9:30, 11:30 to 12:00, 13:30 to 14:00, 15:30 to 16:00; \nJerry has blocked their calendar on Monday during 9:00 to 9:30, 10:00 to 11:00, 11:30 to 12:30, 13:00 to 14:30, 15:30 to 16:00, 16:30 to 17:00; \n\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: ",
  'id': 'calendar_scheduling_example_0'},
 {'num_people': '3',
  'num_days': '1',
  'duration': '0.5',
  'prompt_5shot': "You are an expert at scheduling meetings. You are given a few constraints on the existing schedule of each participant, the meeting duration, and possibly some preferences on the meeting time. Note there exists a solution that works with existing schedule of every participant. Here are a few example tasks and solutions:\n\nTASK: You need to schedule a meeting for Roger, Karen and Dorothy for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nRoger's calendar is wide open the entire day.\nKaren has meetings on Monday during 10:00 to 10:30, 11:30 to 12:00, 12:30 to 13:00, 14:00 to 15:00, 15:30 to 16:00; \nDorothy is busy on Monday during 9:00 to 10:00, 10:30 to 11:00, 11:30 to 12:00, 12:30 to 13:00, 14:00 to 15:30, 16:00 to 17:00; \n\nYou would like to schedule the meeting at their earlist availability.\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 11:00 - 11:30 \n\nTASK: You need to schedule a meeting for Douglas, Lawrence and Isabella for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nDouglas has meetings on Monday during 12:00 to 12:30, 15:00 to 15:30; \nLawrence has meetings on Monday during 10:30 to 12:00, 13:00 to 13:30, 14:00 to 14:30, 15:30 to 16:00; \nIsabella is busy on Monday during 9:00 to 12:30, 13:30 to 17:00; \n\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 12:30 - 13:00 \n\nTASK: You need to schedule a meeting for Joshua, Denise and Jeremy for one hour between the work hours of 9:00 to 17:00 on Monday. 
\n\nHere are the existing schedules for everyone during the day: \nJoshua is busy on Monday during 10:00 to 10:30, 12:00 to 12:30, 14:00 to 14:30, 15:00 to 15:30; \nDenise's calendar is wide open the entire day.\nJeremy has meetings on Monday during 9:30 to 10:30, 12:00 to 13:00, 13:30 to 14:00, 14:30 to 15:00, 15:30 to 16:00, 16:30 to 17:00; \n\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 10:30 - 11:30 \n\nTASK: You need to schedule a meeting for Alan, Elizabeth and Denise for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nAlan has blocked their calendar on Monday during 13:00 to 13:30, 14:00 to 14:30; \nElizabeth is busy on Monday during 9:00 to 9:30, 11:00 to 11:30; \nDenise has blocked their calendar on Monday during 9:00 to 10:00, 10:30 to 11:30, 12:30 to 13:00, 13:30 to 14:00, 14:30 to 16:00, 16:30 to 17:00; \n\nYou would like to schedule the meeting at their earlist availability.\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 10:00 - 10:30 \n\nTASK: You need to schedule a meeting for Mason, Bruce and Christopher for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nMason has meetings on Monday during 9:30 to 10:00, 11:00 to 11:30, 14:30 to 15:00, 16:30 to 17:00; \nBruce is free the entire day.\nChristopher is busy on Monday during 9:30 to 10:30, 11:30 to 12:30, 15:00 to 17:00; \n\nMason would rather not meet on Monday before 12:30. Find a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 12:30 - 13:00 \n\nTASK: You need to schedule a meeting for Raymond, Billy and Donald for half an hour between the work hours of 9:00 to 17:00 on Monday. 
\n\nHere are the existing schedules for everyone during the day: \nRaymond has blocked their calendar on Monday during 9:00 to 9:30, 11:30 to 12:00, 13:00 to 13:30, 15:00 to 15:30; \nBilly has meetings on Monday during 10:00 to 10:30, 12:00 to 13:00, 16:30 to 17:00; \nDonald has meetings on Monday during 9:00 to 9:30, 10:00 to 11:00, 12:00 to 13:00, 14:00 to 14:30, 16:00 to 17:00; \n\nBilly would like to avoid more meetings on Monday after 15:00. Find a time that works for everyone's schedule and constraints. \nSOLUTION: ",
  'golden_plan': 'Here is the proposed time: Monday, 14:30 - 15:00 ',
  'pred_5shot_pro': 'Here is the proposed time: Monday, 13:30 - 14:00 \n',
  'prompt_0shot': "You are an expert at scheduling meetings. You are given a few constraints on the existing schedule of each participant, the meeting duration, and possibly some preferences on the meeting time. Note there exists a solution that works with existing schedule of every participant. Here are a few example tasks and solutions:\n\nTASK: You need to schedule a meeting for Raymond, Billy and Donald for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nRaymond has blocked their calendar on Monday during 9:00 to 9:30, 11:30 to 12:00, 13:00 to 13:30, 15:00 to 15:30; \nBilly has meetings on Monday during 10:00 to 10:30, 12:00 to 13:00, 16:30 to 17:00; \nDonald has meetings on Monday during 9:00 to 9:30, 10:00 to 11:00, 12:00 to 13:00, 14:00 to 14:30, 16:00 to 17:00; \n\nBilly would like to avoid more meetings on Monday after 15:00. Find a time that works for everyone's schedule and constraints. \nSOLUTION: ",
  'id': 'calendar_scheduling_example_1'},
 {'num_people': '3',
  'num_days': '1',
  'duration': '0.5',
  'prompt_5shot': "You are an expert at scheduling meetings. You are given a few constraints on the existing schedule of each participant, the meeting duration, and possibly some preferences on the meeting time. Note there exists a solution that works with existing schedule of every participant. Here are a few example tasks and solutions:\n\nTASK: You need to schedule a meeting for Roger, Karen and Dorothy for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nRoger's calendar is wide open the entire day.\nKaren has meetings on Monday during 10:00 to 10:30, 11:30 to 12:00, 12:30 to 13:00, 14:00 to 15:00, 15:30 to 16:00; \nDorothy is busy on Monday during 9:00 to 10:00, 10:30 to 11:00, 11:30 to 12:00, 12:30 to 13:00, 14:00 to 15:30, 16:00 to 17:00; \n\nYou would like to schedule the meeting at their earlist availability.\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 11:00 - 11:30 \n\nTASK: You need to schedule a meeting for Douglas, Lawrence and Isabella for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nDouglas has meetings on Monday during 12:00 to 12:30, 15:00 to 15:30; \nLawrence has meetings on Monday during 10:30 to 12:00, 13:00 to 13:30, 14:00 to 14:30, 15:30 to 16:00; \nIsabella is busy on Monday during 9:00 to 12:30, 13:30 to 17:00; \n\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 12:30 - 13:00 \n\nTASK: You need to schedule a meeting for Joshua, Denise and Jeremy for one hour between the work hours of 9:00 to 17:00 on Monday. 
\n\nHere are the existing schedules for everyone during the day: \nJoshua is busy on Monday during 10:00 to 10:30, 12:00 to 12:30, 14:00 to 14:30, 15:00 to 15:30; \nDenise's calendar is wide open the entire day.\nJeremy has meetings on Monday during 9:30 to 10:30, 12:00 to 13:00, 13:30 to 14:00, 14:30 to 15:00, 15:30 to 16:00, 16:30 to 17:00; \n\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 10:30 - 11:30 \n\nTASK: You need to schedule a meeting for Alan, Elizabeth and Denise for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nAlan has blocked their calendar on Monday during 13:00 to 13:30, 14:00 to 14:30; \nElizabeth is busy on Monday during 9:00 to 9:30, 11:00 to 11:30; \nDenise has blocked their calendar on Monday during 9:00 to 10:00, 10:30 to 11:30, 12:30 to 13:00, 13:30 to 14:00, 14:30 to 16:00, 16:30 to 17:00; \n\nYou would like to schedule the meeting at their earlist availability.\nFind a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 10:00 - 10:30 \n\nTASK: You need to schedule a meeting for Mason, Bruce and Christopher for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nMason has meetings on Monday during 9:30 to 10:00, 11:00 to 11:30, 14:30 to 15:00, 16:30 to 17:00; \nBruce is free the entire day.\nChristopher is busy on Monday during 9:30 to 10:30, 11:30 to 12:30, 15:00 to 17:00; \n\nMason would rather not meet on Monday before 12:30. Find a time that works for everyone's schedule and constraints. \nSOLUTION: Here is the proposed time: Monday, 12:30 - 13:00 \n\nTASK: You need to schedule a meeting for Roy, Kathryn and Amy for half an hour between the work hours of 9:00 to 17:00 on Monday. 
\n\nHere are the existing schedules for everyone during the day: \nRoy has blocked their calendar on Monday during 9:00 to 9:30, 10:00 to 10:30, 11:00 to 11:30, 12:30 to 13:00; \nKathryn has meetings on Monday during 9:30 to 10:00, 16:30 to 17:00; \nAmy has blocked their calendar on Monday during 9:00 to 14:30, 15:00 to 16:00, 16:30 to 17:00; \n\nAmy would rather not meet on Monday after 15:30. Find a time that works for everyone's schedule and constraints. \nSOLUTION: ",
  'golden_plan': 'Here is the proposed time: Monday, 14:30 - 15:00 ',
  'pred_5shot_pro': 'Here is the proposed time: Monday, 14:30 - 15:00 \n',
  'prompt_0shot': "You are an expert at scheduling meetings. You are given a few constraints on the existing schedule of each participant, the meeting duration, and possibly some preferences on the meeting time. Note there exists a solution that works with existing schedule of every participant. Here are a few example tasks and solutions:\n\nTASK: You need to schedule a meeting for Roy, Kathryn and Amy for half an hour between the work hours of 9:00 to 17:00 on Monday. \n\nHere are the existing schedules for everyone during the day: \nRoy has blocked their calendar on Monday during 9:00 to 9:30, 10:00 to 10:30, 11:00 to 11:30, 12:30 to 13:00; \nKathryn has meetings on Monday during 9:30 to 10:00, 16:30 to 17:00; \nAmy has blocked their calendar on Monday during 9:00 to 14:30, 15:00 to 16:00, 16:30 to 17:00; \n\nAmy would rather not meet on Monday after 15:30. Find a time that works for everyone's schedule and constraints. \nSOLUTION: ",
  'id': 'calendar_scheduling_example_2'}]

### Load examples for trip planning

Data Template:
```
{
    'num_cities': '<number of cities>',
    'cities': '<city1**city2**city3**...>',
    'durations': '<duration1**duration2**duration3**...>',
    'total_days': '<total number of days>',
    'flight_routes': '<direct flight routes between cities>',
    'prompt_0shot': "You plan to visit <num_cities> cities for <total_days> days in total.
        You only take direct flights to commute between cities.
        You would like to visit <city1> for <duration1> days.
        You would like to visit <city2> for <duration2> days.
        You would like to visit <city3> for <duration3> days.
        Here are the cities that have direct flights: <flight_routes>.
        Find a trip plan of visiting the cities for <total_days> days by taking direct flights to commute between them.",
    'prompt_5shot': "You are an expert at planning trips.
        You are given a few constraints regarding the cities to visit, the durations of stay,
        and the available direct flights between the cities.
        Here are a few example tasks and solutions:\n\nTASK: <Example 1>\nSOLUTION: <Solution 1>\n\nTASK: <Example 2>\nSOLUTION: <Solution 2>\n\nTASK: <Example 3>\nSOLUTION: <Solution 3>\n\nTASK: You plan to visit <num_cities> cities for <total_days> days in total...\nSOLUTION:",
    'golden_plan': "Here is the trip plan for visiting <num_cities> cities for <total_days> days:\n\n**Day X-Y:** Arriving in <city1> and visit <city1> for <duration1> days.\n**Day Y:** Fly from <city1> to <city2>.\n**Day Y-Z:** Visit <city2> for <duration2> days.\n**Day Z:** Fly from <city2> to <city3>.\n**Day Z-T:** Visit <city3> for <duration3> days.",
    'pred_5shot_pro': "<Model-generated trip itinerary>",
    'id': '<unique_identifier>'
}

```

In [None]:
data_trip= [{'num_cities': '3',
  'cities': 'Helsinki**Barcelona**Florence',
  'durations': '5**5**6',
  'prompt_0shot': 'You plan to visit 3 European cities for 14 days in total. You only take direct flights to commute between cities. You would like to visit Florence for 6 days. You want to meet a friend in Florence between day 9 and day 14. You would like to visit Barcelona for 5 days. You would like to visit Helsinki for 5 days.\n\nHere are the cities that have direct flights:\nBarcelona and Florence, Helsinki and Barcelona.\n\nFind a trip plan of visiting the cities for 14 days by taking direct flights to commute between them.',
  'prompt_5shot': 'You are an expert at planning trips. You are given a few constraints regarding the cities to visit and the durations of staying at each city. You are also given the flight information between the cities. Here are a few example tasks and solutions:\n\nTASK: You plan to visit 3 European cities for 7 days in total. You only take direct flights to commute between cities. You want to spend 4 days in Berlin. You would like to visit Venice for 3 days. You are going to attend a wedding in Venice between day 5 and day 7. You want to spend 2 days in Tallinn.\n\nHere are the cities that have direct flights:\nBerlin and Venice, Tallinn and Berlin.\n\nFind a trip plan of visiting the cities for 7 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 7 days:\n\n**Day 1-2:** Arriving in Tallinn and visit Tallinn for 2 days.\n**Day 2:** Fly from Tallinn to Berlin.\n**Day 2-5:** Visit Berlin for 4 days.\n**Day 5:** Fly from Berlin to Venice.\n**Day 5-7:** Visit Venice for 3 days.\n\nTASK: You plan to visit 3 European cities for 14 days in total. You only take direct flights to commute between cities. You plan to stay in London for 4 days. You plan to stay in Bucharest for 5 days. You want to meet a friend in Bucharest between day 10 and day 14. You would like to visit Reykjavik for 7 days.\n\nHere are the cities that have direct flights:\nLondon and Bucharest, Reykjavik and London.\n\nFind a trip plan of visiting the cities for 14 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 14 days:\n\n**Day 1-7:** Arriving in Reykjavik and visit Reykjavik for 7 days.\n**Day 7:** Fly from Reykjavik to London.\n**Day 7-10:** Visit London for 4 days.\n**Day 10:** Fly from London to Bucharest.\n**Day 10-14:** Visit Bucharest for 5 days.\n\nTASK: You plan to visit 3 European cities for 12 days in total. 
You only take direct flights to commute between cities. You want to spend 4 days in Seville. You plan to stay in Valencia for 5 days. You have to attend a workshop in Valencia between day 8 and day 12. You plan to stay in Manchester for 5 days.\n\nHere are the cities that have direct flights:\nManchester and Seville, Seville and Valencia.\n\nFind a trip plan of visiting the cities for 12 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 12 days:\n\n**Day 1-5:** Arriving in Manchester and visit Manchester for 5 days.\n**Day 5:** Fly from Manchester to Seville.\n**Day 5-8:** Visit Seville for 4 days.\n**Day 8:** Fly from Seville to Valencia.\n**Day 8-12:** Visit Valencia for 5 days.\n\nTASK: You plan to visit 3 European cities for 8 days in total. You only take direct flights to commute between cities. You plan to stay in Istanbul for 2 days. You want to spend 3 days in Venice. You have to attend a workshop in Venice between day 1 and day 3. You would like to visit Valencia for 5 days.\n\nHere are the cities that have direct flights:\nVenice and Istanbul, Istanbul and Valencia.\n\nFind a trip plan of visiting the cities for 8 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 8 days:\n\n**Day 1-3:** Arriving in Venice and visit Venice for 3 days.\n**Day 3:** Fly from Venice to Istanbul.\n**Day 3-4:** Visit Istanbul for 2 days.\n**Day 4:** Fly from Istanbul to Valencia.\n**Day 4-8:** Visit Valencia for 5 days.\n\nTASK: You plan to visit 3 European cities for 11 days in total. You only take direct flights to commute between cities. You want to spend 2 days in Porto. You want to spend 4 days in Oslo. You would like to visit Split for 7 days. 
You plan to visit relatives in Split between day 5 and day 11.\n\nHere are the cities that have direct flights:\nPorto and Oslo, Oslo and Split.\n\nFind a trip plan of visiting the cities for 11 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 11 days:\n\n**Day 1-2:** Arriving in Porto and visit Porto for 2 days.\n**Day 2:** Fly from Porto to Oslo.\n**Day 2-5:** Visit Oslo for 4 days.\n**Day 5:** Fly from Oslo to Split.\n**Day 5-11:** Visit Split for 7 days.\n\nTASK: You plan to visit 3 European cities for 14 days in total. You only take direct flights to commute between cities. You would like to visit Florence for 6 days. You want to meet a friend in Florence between day 9 and day 14. You would like to visit Barcelona for 5 days. You would like to visit Helsinki for 5 days.\n\nHere are the cities that have direct flights:\nBarcelona and Florence, Helsinki and Barcelona.\n\nFind a trip plan of visiting the cities for 14 days by taking direct flights to commute between them.\nSOLUTION:',
  'golden_plan': 'Here is the trip plan for visiting the 3 European cities for 14 days:\n\n**Day 1-5:** Arriving in Helsinki and visit Helsinki for 5 days.\n**Day 5:** Fly from Helsinki to Barcelona.\n**Day 5-9:** Visit Barcelona for 5 days.\n**Day 9:** Fly from Barcelona to Florence.\n**Day 9-14:** Visit Florence for 6 days.',
  'pred_5shot_pro': 'Here is the trip plan for visiting the 3 European cities for 14 days:\n\n**Day 1-5:** Arriving in Helsinki and visit Helsinki for 5 days.\n**Day 5:** Fly from Helsinki to Barcelona.\n**Day 5-9:** Visit Barcelona for 5 days.\n**Day 9:** Fly from Barcelona to Florence.\n**Day 9-14:** Visit Florence for 6 days. \n',
  'id': 'trip_planning_example_0'},
 {'num_cities': '3',
  'cities': 'Riga**Manchester**Split',
  'durations': '7**4**6',
  'prompt_0shot': 'You plan to visit 3 European cities for 15 days in total. You only take direct flights to commute between cities. You want to spend 6 days in Split. You want to spend 4 days in Manchester. You want to spend 7 days in Riga.\n\nHere are the cities that have direct flights:\nRiga and Manchester, from Manchester to Split.\n\nFind a trip plan of visiting the cities for 15 days by taking direct flights to commute between them.',
  'prompt_5shot': 'You are an expert at planning trips. You are given a few constraints regarding the cities to visit and the durations of staying at each city. You are also given the flight information between the cities. Here are a few example tasks and solutions:\n\nTASK: You plan to visit 3 European cities for 7 days in total. You only take direct flights to commute between cities. You want to spend 4 days in Berlin. You would like to visit Venice for 3 days. You are going to attend a wedding in Venice between day 5 and day 7. You want to spend 2 days in Tallinn.\n\nHere are the cities that have direct flights:\nBerlin and Venice, Tallinn and Berlin.\n\nFind a trip plan of visiting the cities for 7 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 7 days:\n\n**Day 1-2:** Arriving in Tallinn and visit Tallinn for 2 days.\n**Day 2:** Fly from Tallinn to Berlin.\n**Day 2-5:** Visit Berlin for 4 days.\n**Day 5:** Fly from Berlin to Venice.\n**Day 5-7:** Visit Venice for 3 days.\n\nTASK: You plan to visit 3 European cities for 14 days in total. You only take direct flights to commute between cities. You plan to stay in London for 4 days. You plan to stay in Bucharest for 5 days. You want to meet a friend in Bucharest between day 10 and day 14. You would like to visit Reykjavik for 7 days.\n\nHere are the cities that have direct flights:\nLondon and Bucharest, Reykjavik and London.\n\nFind a trip plan of visiting the cities for 14 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 14 days:\n\n**Day 1-7:** Arriving in Reykjavik and visit Reykjavik for 7 days.\n**Day 7:** Fly from Reykjavik to London.\n**Day 7-10:** Visit London for 4 days.\n**Day 10:** Fly from London to Bucharest.\n**Day 10-14:** Visit Bucharest for 5 days.\n\nTASK: You plan to visit 3 European cities for 12 days in total. 
You only take direct flights to commute between cities. You want to spend 4 days in Seville. You plan to stay in Valencia for 5 days. You have to attend a workshop in Valencia between day 8 and day 12. You plan to stay in Manchester for 5 days.\n\nHere are the cities that have direct flights:\nManchester and Seville, Seville and Valencia.\n\nFind a trip plan of visiting the cities for 12 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 12 days:\n\n**Day 1-5:** Arriving in Manchester and visit Manchester for 5 days.\n**Day 5:** Fly from Manchester to Seville.\n**Day 5-8:** Visit Seville for 4 days.\n**Day 8:** Fly from Seville to Valencia.\n**Day 8-12:** Visit Valencia for 5 days.\n\nTASK: You plan to visit 3 European cities for 8 days in total. You only take direct flights to commute between cities. You plan to stay in Istanbul for 2 days. You want to spend 3 days in Venice. You have to attend a workshop in Venice between day 1 and day 3. You would like to visit Valencia for 5 days.\n\nHere are the cities that have direct flights:\nVenice and Istanbul, Istanbul and Valencia.\n\nFind a trip plan of visiting the cities for 8 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 8 days:\n\n**Day 1-3:** Arriving in Venice and visit Venice for 3 days.\n**Day 3:** Fly from Venice to Istanbul.\n**Day 3-4:** Visit Istanbul for 2 days.\n**Day 4:** Fly from Istanbul to Valencia.\n**Day 4-8:** Visit Valencia for 5 days.\n\nTASK: You plan to visit 3 European cities for 11 days in total. You only take direct flights to commute between cities. You want to spend 2 days in Porto. You want to spend 4 days in Oslo. You would like to visit Split for 7 days. 
You plan to visit relatives in Split between day 5 and day 11.\n\nHere are the cities that have direct flights:\nPorto and Oslo, Oslo and Split.\n\nFind a trip plan of visiting the cities for 11 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 11 days:\n\n**Day 1-2:** Arriving in Porto and visit Porto for 2 days.\n**Day 2:** Fly from Porto to Oslo.\n**Day 2-5:** Visit Oslo for 4 days.\n**Day 5:** Fly from Oslo to Split.\n**Day 5-11:** Visit Split for 7 days.\n\nTASK: You plan to visit 3 European cities for 15 days in total. You only take direct flights to commute between cities. You want to spend 6 days in Split. You want to spend 4 days in Manchester. You want to spend 7 days in Riga.\n\nHere are the cities that have direct flights:\nRiga and Manchester, from Manchester to Split.\n\nFind a trip plan of visiting the cities for 15 days by taking direct flights to commute between them.\nSOLUTION:',
  'golden_plan': 'Here is the trip plan for visiting the 3 European cities for 15 days:\n\n**Day 1-7:** Arriving in Riga and visit Riga for 7 days.\n**Day 7:** Fly from Riga to Manchester.\n**Day 7-10:** Visit Manchester for 4 days.\n**Day 10:** Fly from Manchester to Split.\n**Day 10-15:** Visit Split for 6 days.',
  'pred_5shot_pro': 'Here is the trip plan for visiting the 3 European cities for 15 days:\n\n**Day 1-7:** Arriving in Riga and visit Riga for 7 days.\n**Day 7:** Fly from Riga to Manchester.\n**Day 7-10:** Visit Manchester for 4 days.\n**Day 10:** Fly from Manchester to Split.\n**Day 10-15:** Visit Split for 6 days. \n',
  'id': 'trip_planning_example_1'},
 {'num_cities': '3',
  'cities': 'Reykjavik**Vienna**Venice',
  'durations': '2**7**4',
  'prompt_0shot': 'You plan to visit 3 European cities for 11 days in total. You only take direct flights to commute between cities. You would like to visit Reykjavik for 2 days. You want to spend 7 days in Vienna. You would like to visit Venice for 4 days. You are going to attend a wedding in Venice between day 8 and day 11.\n\nHere are the cities that have direct flights:\nReykjavik and Vienna, Vienna and Venice.\n\nFind a trip plan of visiting the cities for 11 days by taking direct flights to commute between them.',
  'prompt_5shot': 'You are an expert at planning trips. You are given a few constraints regarding the cities to visit and the durations of staying at each city. You are also given the flight information between the cities. Here are a few example tasks and solutions:\n\nTASK: You plan to visit 3 European cities for 7 days in total. You only take direct flights to commute between cities. You want to spend 4 days in Berlin. You would like to visit Venice for 3 days. You are going to attend a wedding in Venice between day 5 and day 7. You want to spend 2 days in Tallinn.\n\nHere are the cities that have direct flights:\nBerlin and Venice, Tallinn and Berlin.\n\nFind a trip plan of visiting the cities for 7 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 7 days:\n\n**Day 1-2:** Arriving in Tallinn and visit Tallinn for 2 days.\n**Day 2:** Fly from Tallinn to Berlin.\n**Day 2-5:** Visit Berlin for 4 days.\n**Day 5:** Fly from Berlin to Venice.\n**Day 5-7:** Visit Venice for 3 days.\n\nTASK: You plan to visit 3 European cities for 14 days in total. You only take direct flights to commute between cities. You plan to stay in London for 4 days. You plan to stay in Bucharest for 5 days. You want to meet a friend in Bucharest between day 10 and day 14. You would like to visit Reykjavik for 7 days.\n\nHere are the cities that have direct flights:\nLondon and Bucharest, Reykjavik and London.\n\nFind a trip plan of visiting the cities for 14 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 14 days:\n\n**Day 1-7:** Arriving in Reykjavik and visit Reykjavik for 7 days.\n**Day 7:** Fly from Reykjavik to London.\n**Day 7-10:** Visit London for 4 days.\n**Day 10:** Fly from London to Bucharest.\n**Day 10-14:** Visit Bucharest for 5 days.\n\nTASK: You plan to visit 3 European cities for 12 days in total. 
You only take direct flights to commute between cities. You want to spend 4 days in Seville. You plan to stay in Valencia for 5 days. You have to attend a workshop in Valencia between day 8 and day 12. You plan to stay in Manchester for 5 days.\n\nHere are the cities that have direct flights:\nManchester and Seville, Seville and Valencia.\n\nFind a trip plan of visiting the cities for 12 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 12 days:\n\n**Day 1-5:** Arriving in Manchester and visit Manchester for 5 days.\n**Day 5:** Fly from Manchester to Seville.\n**Day 5-8:** Visit Seville for 4 days.\n**Day 8:** Fly from Seville to Valencia.\n**Day 8-12:** Visit Valencia for 5 days.\n\nTASK: You plan to visit 3 European cities for 8 days in total. You only take direct flights to commute between cities. You plan to stay in Istanbul for 2 days. You want to spend 3 days in Venice. You have to attend a workshop in Venice between day 1 and day 3. You would like to visit Valencia for 5 days.\n\nHere are the cities that have direct flights:\nVenice and Istanbul, Istanbul and Valencia.\n\nFind a trip plan of visiting the cities for 8 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 8 days:\n\n**Day 1-3:** Arriving in Venice and visit Venice for 3 days.\n**Day 3:** Fly from Venice to Istanbul.\n**Day 3-4:** Visit Istanbul for 2 days.\n**Day 4:** Fly from Istanbul to Valencia.\n**Day 4-8:** Visit Valencia for 5 days.\n\nTASK: You plan to visit 3 European cities for 11 days in total. You only take direct flights to commute between cities. You want to spend 2 days in Porto. You want to spend 4 days in Oslo. You would like to visit Split for 7 days. 
You plan to visit relatives in Split between day 5 and day 11.\n\nHere are the cities that have direct flights:\nPorto and Oslo, Oslo and Split.\n\nFind a trip plan of visiting the cities for 11 days by taking direct flights to commute between them.\nSOLUTION: Here is the trip plan for visiting the 3 European cities for 11 days:\n\n**Day 1-2:** Arriving in Porto and visit Porto for 2 days.\n**Day 2:** Fly from Porto to Oslo.\n**Day 2-5:** Visit Oslo for 4 days.\n**Day 5:** Fly from Oslo to Split.\n**Day 5-11:** Visit Split for 7 days.\n\nTASK: You plan to visit 3 European cities for 11 days in total. You only take direct flights to commute between cities. You would like to visit Reykjavik for 2 days. You want to spend 7 days in Vienna. You would like to visit Venice for 4 days. You are going to attend a wedding in Venice between day 8 and day 11.\n\nHere are the cities that have direct flights:\nReykjavik and Vienna, Vienna and Venice.\n\nFind a trip plan of visiting the cities for 11 days by taking direct flights to commute between them.\nSOLUTION:',
  'golden_plan': 'Here is the trip plan for visiting the 3 European cities for 11 days:\n\n**Day 1-2:** Arriving in Reykjavik and visit Reykjavik for 2 days.\n**Day 2:** Fly from Reykjavik to Vienna.\n**Day 2-8:** Visit Vienna for 7 days.\n**Day 8:** Fly from Vienna to Venice.\n**Day 8-11:** Visit Venice for 4 days.',
  'pred_5shot_pro': 'Here is the trip plan for visiting the 3 European cities for 11 days:\n\n**Day 1-2:** Arriving in Reykjavik and visit Reykjavik for 2 days.\n**Day 2:** Fly from Reykjavik to Vienna.\n**Day 2-8:** Visit Vienna for 7 days.\n**Day 8:** Fly from Vienna to Venice.\n**Day 8-11:** Visit Venice for 4 days. \n',
  'id': 'trip_planning_example_2'}]

# Run Inference on NATURAL PLAN using PlanGEN (Best of N)

**Import Libraries for Evaluation**

In [None]:
import os
from os.path import join
import json
from tqdm import tqdm
import time

### Prediction Loop for NATURAL PLAN (Calendar Scheduling)

> Run the prediction loop for the calendar scheduling task and save the results in the `final_predictions` list.

In [None]:
final_predictions = []

# Step-by-step planning instructions substituted for the closing
# "Find a time ... SOLUTION:" cue of the 5-shot prompt. The text is the same
# for every instance, so it is built once before the loop.
replace_str= "Plan to Find the Right Slot:\r\n\r\n1. Analyze Constraints: Carefully review all constraints provided in the problem, including participant schedules, meeting duration, and any specific preferences or restrictions.\r\n\r\n2. Identify Individual Availability: For each participant, list all their free time slots during the specified work hours.\r\n\r\n3. Determine Common Availability: Find the overlapping free time slots (intersection of availability) across all participants.\r\n\r\n4. Prioritize Earliest Time Slot: Always choose the earliest time slot from the intersection that satisfies all constraints.\r\n\r\n5. Ensure Solution Exists: Since a solution is guaranteed for each task, always provide a valid time slot that works for everyone.\r\n\r\n6. Strict Format Requirement: Present the solution using the same format as provided in the examples. Avoid deviating from the established format.\r\n\r\nDO NOT only provide solution, also provide step-by-step plan execution.\r\n\r\nExecute the Plan Step-by-Step: [Provide executed steps]\r\n\r\nSOLUTION: [strictly use the format provided in the above examples]"

# Request template for extracting instance-specific constraints;
# `{problem}` is filled with the instance's 0-shot prompt.
verification_prompt_temp = """Analyze the input problem and identify all instance-specific constraints from the problem description. These may include:
- Participant schedules
- Meeting duration
- Specific preferences or restrictions
- Work hours and availability
- Any other relevant constraints based on the input problem

Input Problem:
{problem}
"""

max_retries = 5

for data_point in tqdm(data_calendar, desc="Processing calendar with PlanGEN (Best of N)", total=len(data_calendar)):

    # NOTE(review): str.replace rewrites every occurrence of the cue; if the
    # few-shot examples inside `prompt_5shot` contain it as well, they are
    # rewritten too -- confirm that this is intended.
    problem_statement = data_point['prompt_5shot'].replace(
        'Find a time that works for everyone\'s schedule and constraints. \nSOLUTION:',
        replace_str,
    )

    # The model's reply to the constraint-extraction request is what the
    # evaluator receives as its `verification_prompt`.
    verification_prompt = call_gemini(
        verification_prompt_temp.format(problem=data_point['prompt_0shot']),
        model_name=model_name,
    )

    evaluator = SelfConsistencyEvaluator(verification_prompt=verification_prompt, model_name=model_name)

    # Up to `max_retries` attempts; the `else` branch runs only when no
    # attempt reached `break`, i.e. when every attempt raised.
    for attempt in range(1, max_retries + 1):
        try:
            responses, best_answer = evaluator.get_best_response(problem_statement, no_of_times=5)
            break
        except Exception as e:
            print(f"Error during PlanGEN (Best of N) run (attempt {attempt}): {e}")
            if attempt < max_retries:
                print(f"Retrying... ({attempt + 1}/{max_retries})")
                time.sleep(2)  # Add small delay between retries
    else:
        print("All retry attempts failed")
        responses = []
        best_answer = ""

    # Results are written onto the dataset entry itself (in place) and the
    # same object is collected in `final_predictions`.
    data_point['model_prediction'] = best_answer
    data_point['n_samples'] = responses

    final_predictions.append(data_point)

### Prediction Loop for NATURAL PLAN (Trip Planning)

> Run the prediction loop for the trip planning task and save the results in the `final_predictions` list.

In [None]:
final_predictions = []

# Planning constraints and output-format instructions appended after each
# 5-shot prompt. Identical for every instance, so defined once up front.
replace_str= """Please follow the below constraints while generating plan:\r\n- Strictly ensure the trip duration equals the specified total trip days.\r\n- First find the city order based on constraints given and follow the exact city order while generating plan\r\n- Follow the exact order and timing for events, workshops, or other fixed-date activities.\r\n- Use only direct flights between cities as provided in the flight connections list.\r\n- On the last day of your visit to each city, you can take a direct flight to the next city and arrive on the same day. Both the day you arrive and the day you depart count toward the total number of days spent in each city.\r\n- Confirm each city’s stay aligns with the constraints.\r\n\r\nProvide the itinerary strictly in the following format:\r\n\r\nDay 1-[End Day]: City Name and details of stay.\r\nDay [Start of Travel]: Fly from [City A] to [City B].\r\n\r\nSOLUTION: [strictly follow the same solution format as shown above]"""

# Fixed verification checklist handed to the evaluator (the trip task uses
# the same checklist for all instances; no per-instance model call here).
verification_prompt = """1. Strictly ensure the trip duration equals the specified total trip days.
2. Verify the correct city order based on constraints
3. Verify the order and timing for events, workshops, or other fixed-date activities.
4. Verify that only direct flights between cities as provided in the flight connections list are used.
5. Verify each city’s stay aligns with the constraints.
"""

max_retries = 5

for data_point in tqdm(data_trip, desc="Processing trip with PlanGEN (Best of N)", total=len(data_trip)):

    problem_statement = data_point['prompt_5shot'] + '\n' + replace_str

    # A fresh evaluator is built for every instance, as in the other loops.
    evaluator = SelfConsistencyEvaluator(verification_prompt=verification_prompt, model_name=model_name)

    # Up to `max_retries` attempts; the `else` branch runs only when no
    # attempt reached `break`, i.e. when every attempt raised.
    for attempt in range(1, max_retries + 1):
        try:
            responses, best_answer = evaluator.get_best_response(problem_statement, no_of_times=5)
            break
        except Exception as e:
            print(f"Error during PlanGEN (Best of N) run (attempt {attempt}): {e}")
            if attempt < max_retries:
                print(f"Retrying... ({attempt + 1}/{max_retries})")
                time.sleep(2)  # Add small delay between retries
    else:
        print("All retry attempts failed")
        responses = []
        best_answer = ""

    # Results are written onto the dataset entry itself (in place) and the
    # same object is collected in `final_predictions`.
    data_point['model_prediction'] = best_answer
    data_point['n_samples'] = responses

    final_predictions.append(data_point)

# Run Inference on NATURAL PLAN using PlanGEN (ToT)



**Import Libraries for Evaluation**

In [None]:
import os
from os.path import join
import json
from tqdm import tqdm
import time

### Prediction Loop for NATURAL PLAN (Calendar)

> Run the prediction loop for the calendar scheduling task and save the results in the `final_predictions` list.

In [None]:
final_predictions = []

# Step-by-step planning instructions substituted for the closing
# "Find a time ... SOLUTION:" cue of the 5-shot prompt. Loop-invariant, so
# built once before iterating.
replace_str= "Plan to Find the Right Slot:\r\n\r\n1. Analyze Constraints: Carefully review all constraints provided in the problem, including participant schedules, meeting duration, and any specific preferences or restrictions.\r\n\r\n2. Identify Individual Availability: For each participant, list all their free time slots during the specified work hours.\r\n\r\n3. Determine Common Availability: Find the overlapping free time slots (intersection of availability) across all participants.\r\n\r\n4. Prioritize Earliest Time Slot: Always choose the earliest time slot from the intersection that satisfies all constraints.\r\n\r\n5. Ensure Solution Exists: Since a solution is guaranteed for each task, always provide a valid time slot that works for everyone.\r\n\r\n6. Strict Format Requirement: Present the solution using the same format as provided in the examples. Avoid deviating from the established format.\r\n\r\nDO NOT only provide solution, also provide step-by-step plan execution.\r\n\r\nExecute the Plan Step-by-Step: [Provide executed steps]\r\n\r\nSOLUTION: [strictly use the format provided in the above examples]"

# Request template for extracting instance-specific constraints;
# `{problem}` is filled with the instance's 0-shot prompt.
verification_prompt_temp = """Analyze the input problem and identify all instance-specific constraints from the problem description. These may include:
- Participant schedules
- Meeting duration
- Specific preferences or restrictions
- Work hours and availability
- Any other relevant constraints based on the input problem

Input Problem:
{problem}
"""

# Solving phase: template asking the model to rewrite the generated plan in
# the few-shot solution format.
solving_prompt = """Your task is to only format the generated plan into desired solution format given in few-shot examples.

Problem:
{question}

Generated Plan:
{plan}

Please provide your solution following the format shown in the examples above.
"""

max_retries = 5

for data_point in tqdm(data_calendar, desc="Processing calendar with PlanGEN (ToT)", total=len(data_calendar)):

    # NOTE(review): str.replace rewrites every occurrence of the cue; if the
    # few-shot examples inside `prompt_5shot` contain it as well, they are
    # rewritten too -- confirm that this is intended.
    problem_statement = data_point['prompt_5shot'].replace('Find a time that works for everyone\'s schedule and constraints. \nSOLUTION:', replace_str)

    # The model's reply to the constraint-extraction request is what the
    # solver receives as its `verification_prompt`.
    verification = verification_prompt_temp.format(problem=data_point['prompt_0shot'])
    verification_prompt = call_gemini(verification, model_name=model_name)

    bfs_solver = BFS_sbs(
        problem=problem_statement,
        verification_prompt=verification_prompt,
        no_samples=3,
        model_name=model_name,
    )

    best_answer = ''
    f_answer = ''  # second value returned by run(); not used further in this cell
    plan_generated = False
    # Up to `max_retries` attempts; the `else` branch runs only when no
    # attempt reached `break`, i.e. when every attempt raised.
    for retries in range(1, max_retries + 1):
        try:
            best_answer, f_answer = bfs_solver.run()
            plan_generated = True
            break
        except Exception as e:
            print(f"Error occurred: {e}. Retrying {retries}/{max_retries}...")
            if retries < max_retries:
                # Pause briefly so a transient API error is not retried
                # back-to-back (same 2 s delay the Best of N loops use).
                time.sleep(2)
    else:
        print("Max retries reached. Assigning empty strings.")
        best_answer = ''
        f_answer = ''

    if plan_generated:
        solving_prompt_filled = solving_prompt.format(question=data_point["prompt_5shot"], plan=best_answer)
        prediction = call_gemini(solving_prompt_filled, model_name=model_name)
    else:
        # Every attempt failed, so there is no plan to format. Calling the
        # model with an empty plan would store an answer that did not come
        # from the ToT search; record an empty prediction instead.
        prediction = ''

    # Results are written onto the dataset entry itself (in place) and the
    # same object is collected in `final_predictions`.
    data_point['model_plan'] = best_answer
    data_point['model_prediction'] = prediction

    final_predictions.append(data_point)

### Prediction Loop for NATURAL PLAN (Trip)

> Run the prediction loop for the trip planning task and save the results in the `final_predictions` list.

In [None]:
final_predictions = []

# Planning constraints and output-format instructions appended after each
# 5-shot prompt. Loop-invariant, so defined once before iterating.
replace_str= """Please follow the below constraints while generating plan:\r\n- Strictly ensure the trip duration equals the specified total trip days.\r\n- First find the city order based on constraints given and follow the exact city order while generating plan\r\n- Follow the exact order and timing for events, workshops, or other fixed-date activities.\r\n- Use only direct flights between cities as provided in the flight connections list.\r\n- On the last day of your visit to each city, you can take a direct flight to the next city and arrive on the same day. Both the day you arrive and the day you depart count toward the total number of days spent in each city.\r\n- Confirm each city’s stay aligns with the constraints.\r\n\r\nProvide the itinerary strictly in the following format:\r\n\r\nDay 1-[End Day]: City Name and details of stay.\r\nDay [Start of Travel]: Fly from [City A] to [City B].\r\n\r\nSOLUTION: [strictly follow the same solution format as shown above]"""

# Fixed verification checklist handed to the solver (the trip task uses the
# same checklist for all instances; no per-instance model call here).
verification_prompt = """1. Strictly ensure the trip duration equals the specified total trip days.
2. Verify the correct city order based on constraints
3. Verify the order and timing for events, workshops, or other fixed-date activities.
4. Verify that only direct flights between cities as provided in the flight connections list are used.
5. Verify each city’s stay aligns with the constraints.
"""

# Solving phase: template asking the model to rewrite the generated plan in
# the few-shot solution format.
solving_prompt = """Your task is to only format the generated plan into desired solution format given in few-shot examples.

Problem:
{question}

Generated Plan:
{plan}

Please provide your solution following the format shown in the examples above.
"""

max_retries = 5

for data_point in tqdm(data_trip, desc="Processing trip with PlanGEN (ToT)", total=len(data_trip)):

    problem_statement = data_point['prompt_5shot'] + '\n' + replace_str

    bfs_solver = BFS_sbs(
        problem=problem_statement,
        verification_prompt=verification_prompt,
        no_samples=3,
        model_name=model_name,
    )

    best_answer = ''
    f_answer = ''  # second value returned by run(); not used further in this cell
    plan_generated = False
    # Up to `max_retries` attempts; the `else` branch runs only when no
    # attempt reached `break`, i.e. when every attempt raised.
    for retries in range(1, max_retries + 1):
        try:
            best_answer, f_answer = bfs_solver.run()
            plan_generated = True
            break
        except Exception as e:
            print(f"Error occurred: {e}. Retrying {retries}/{max_retries}...")
            if retries < max_retries:
                # Pause briefly so a transient API error is not retried
                # back-to-back (same 2 s delay the Best of N loops use).
                time.sleep(2)
    else:
        print("Max retries reached. Assigning empty strings.")
        best_answer = ''
        f_answer = ''

    if plan_generated:
        solving_prompt_filled = solving_prompt.format(question=data_point["prompt_5shot"], plan=best_answer)
        prediction = call_gemini(solving_prompt_filled, model_name=model_name)
    else:
        # Every attempt failed, so there is no plan to format. Calling the
        # model with an empty plan would store an answer that did not come
        # from the ToT search; record an empty prediction instead.
        prediction = ''

    # Results are written onto the dataset entry itself (in place) and the
    # same object is collected in `final_predictions`.
    data_point['model_plan'] = best_answer
    data_point['model_prediction'] = prediction

    final_predictions.append(data_point)

# Run Inference on NATURAL PLAN using PlanGEN (REBASE)

**Import Libraries for Evaluation**



In [None]:
import os
from os.path import join
import json
from tqdm import tqdm
import time

### Prediction Loop for NATURAL PLAN (Calendar)

> Run the prediction loop for calendar scheduling task and save in `final_predictions` list



In [None]:
# Collects one result record per calendar instance for PlanGEN (REBASE).
final_predictions = []

# Load configuration for Rebase

paras = {
    "temperature": 0.7,
    "max_step_tokens": 256,
    "max_tokens": 1024,
    "select_method": "softmax",
    "num_threads": 8,
    "softmax_temperature": 0.2,
    "reward_model_type": "llemma",
    "policy_model_type": "llemma",
    "model_name": model_name,
    "store_path": "./exp_results/rebase_16/",
    "width": 10
}

# Initialize Rebase with the chosen model and parameters
rebase_solver = Rebase(model_name=paras["model_name"], paras=paras)

# --- Loop-invariant prompt pieces (built once instead of once per instance) ---

# Planning instructions substituted for the dataset's original task instruction.
replace_str= "Plan to Find the Right Slot:\r\n\r\n1. Analyze Constraints: Carefully review all constraints provided in the problem, including participant schedules, meeting duration, and any specific preferences or restrictions.\r\n\r\n2. Identify Individual Availability: For each participant, list all their free time slots during the specified work hours.\r\n\r\n3. Determine Common Availability: Find the overlapping free time slots (intersection of availability) across all participants.\r\n\r\n4. Prioritize Earliest Time Slot: Always choose the earliest time slot from the intersection that satisfies all constraints.\r\n\r\n5. Ensure Solution Exists: Since a solution is guaranteed for each task, always provide a valid time slot that works for everyone.\r\n\r\n6. Strict Format Requirement: Present the solution using the same format as provided in the examples. Avoid deviating from the established format.\r\n\r\nDO NOT only provide solution, also provide step-by-step plan execution.\r\n\r\nExecute the Plan Step-by-Step: [Provide executed steps]\r\n\r\nSOLUTION: [strictly use the format provided in the above examples]"

# Template used to ask the model for the instance-specific constraints.
verification_prompt_temp = """Analyze the input problem and identify all instance-specific constraints from the problem description. These may include:
- Participant schedules
- Meeting duration
- Specific preferences or restrictions
- Work hours and availability
- Any other relevant constraints based on the input problem

Input Problem:
{problem}
"""

# Template for the final call that reformats the generated plan.
solving_prompt = """Your task is to only format the generated plan into desired solution format given in few-shot examples.

Problem:
{question}

Generated Plan:
{plan}

Please provide your solution following the format shown in the examples above.
"""

max_retries = 5

# tqdm infers `total` from len(data_calendar); the enumerate index was never used.
for data_point in tqdm(data_calendar, desc="Processing calendar with PlanGEN (REBASE)"):

    # NOTE(review): str.replace substitutes every occurrence of the instruction
    # text; if the few-shot examples inside 'prompt_5shot' contain the same
    # sentence they are rewritten as well - confirm this is intended.
    problem_statement = data_point['prompt_5shot'].replace('Find a time that works for everyone\'s schedule and constraints. \nSOLUTION:', replace_str)

    # Constraint step: the model's answer becomes the verification prompt.
    verification = verification_prompt_temp.format(problem=data_point['prompt_0shot'])
    verification_prompt = call_gemini(verification, model_name=model_name)

    # Run Rebase to get the plan (formerly handled by BFS_sbs)
    retries = 0
    best_answer = ''
    complete_solutions = []
    while retries < max_retries:
        try:
            complete_solutions = rebase_solver.reward_guided_search(problem_statement, verification_prompt)
            if complete_solutions:
                # Use the result's `.value` when it has one, otherwise its string form.
                best_answer = complete_solutions.value if hasattr(complete_solutions, 'value') else str(complete_solutions)
            else:
                best_answer = ''
            break
        except Exception as e:
            retries += 1
            print(f"Error occurred: {e}. Retrying {retries}/{max_retries}...")
            if retries == max_retries:
                print("Max retries reached. Assigning empty string for best_answer.")
                best_answer = ''

    # Solving phase
    solving_prompt_filled = solving_prompt.format(question=data_point["prompt_5shot"], plan=best_answer)
    prediction = call_gemini(solving_prompt_filled, model_name=model_name)

    # Store the outputs on a shallow copy so the shared `data_calendar` records
    # are not overwritten; results kept from another framework's run stay intact.
    result = dict(data_point)
    result['model_plan'] = best_answer
    result['model_prediction'] = prediction

    final_predictions.append(result)

### Prediction Loop for NATURAL PLAN (Trip)

> Run the prediction loop for the trip planning task and save the results in the `final_predictions` list

In [None]:
# Collects one result record per trip-planning instance for PlanGEN (REBASE).
final_predictions = []

# Load configuration for Rebase

paras = {
    "temperature": 0.7,
    "max_step_tokens": 256,
    "max_tokens": 1024,
    "select_method": "softmax",
    "num_threads": 8,
    "softmax_temperature": 0.2,
    "reward_model_type": "llemma",
    "policy_model_type": "llemma",
    "model_name": model_name,
    "store_path": "./exp_results/rebase_16/",
    "width": 10
}

# Initialize Rebase with the chosen model and parameters
rebase_solver = Rebase(model_name=paras["model_name"], paras=paras)

# --- Loop-invariant prompt pieces (built once instead of once per instance) ---

# Planning constraints and output format appended after each few-shot prompt.
replace_str= """Please follow the below constraints while generating plan:\r\n- Strictly ensure the trip duration equals the specified total trip days.\r\n- First find the city order based on constraints given and follow the exact city order while generating plan\r\n- Follow the exact order and timing for events, workshops, or other fixed-date activities.\r\n- Use only direct flights between cities as provided in the flight connections list.\r\n- On the last day of your visit to each city, you can take a direct flight to the next city and arrive on the same day. Both the day you arrive and the day you depart count toward the total number of days spent in each city.\r\n- Confirm each city’s stay aligns with the constraints.\r\n\r\nProvide the itinerary strictly in the following format:\r\n\r\nDay 1-[End Day]: City Name and details of stay.\r\nDay [Start of Travel]: Fly from [City A] to [City B].\r\n\r\nSOLUTION: [strictly follow the same solution format as shown above]"""

# Fixed verification checklist shared by every trip instance.
verification_prompt = """1. Strictly ensure the trip duration equals the specified total trip days.
2. Verify the correct city order based on constraints
3. Verify the order and timing for events, workshops, or other fixed-date activities.
4. Verify that only direct flights between cities as provided in the flight connections list are used.
5. Verify each city’s stay aligns with the constraints.
"""

# Template for the final call that reformats the generated plan.
solving_prompt = """Your task is to only format the generated plan into desired solution format given in few-shot examples.

Problem:
{question}

Generated Plan:
{plan}

Please provide your solution following the format shown in the examples above.
"""

max_retries = 5

# tqdm infers `total` from len(data_trip); the enumerate index was never used.
for data_point in tqdm(data_trip, desc="Processing trip with PlanGEN (REBASE)"):

    problem_statement = data_point['prompt_5shot'] + '\n' + replace_str

    # Run Rebase to get the plan (formerly handled by BFS_sbs)
    retries = 0
    best_answer = ''
    complete_solutions = []
    while retries < max_retries:
        try:
            complete_solutions = rebase_solver.reward_guided_search(problem_statement, verification_prompt)
            if complete_solutions:
                # Use the result's `.value` when it has one, otherwise its string form.
                best_answer = complete_solutions.value if hasattr(complete_solutions, 'value') else str(complete_solutions)
            else:
                best_answer = ''
            break
        except Exception as e:
            retries += 1
            print(f"Error occurred: {e}. Retrying {retries}/{max_retries}...")
            if retries == max_retries:
                print("Max retries reached. Assigning empty string for best_answer.")
                best_answer = ''

    # Solving phase
    solving_prompt_filled = solving_prompt.format(question=data_point["prompt_5shot"], plan=best_answer)
    prediction = call_gemini(solving_prompt_filled, model_name=model_name)

    # Store the outputs on a shallow copy so the shared `data_trip` records
    # are not overwritten; results kept from another framework's run stay intact.
    result = dict(data_point)
    result['model_plan'] = best_answer
    result['model_prediction'] = prediction

    final_predictions.append(result)

# Run Inference on NATURAL PLAN using PlanGEN (Mixture of Algorithms)

**Import Libraries for Evaluation**

In [None]:
import os
from os.path import join
import json
from tqdm import tqdm
import time

### Prediction Loop for NATURAL PLAN (Calendar)

> Run the prediction loop for calendar scheduling task and save in `final_predictions` list

In [None]:
# Collects one result record per calendar instance for PlanGEN (Mixture of Algorithms).
final_predictions = []

# --- Loop-invariant prompt pieces (built once instead of once per instance) ---

# Planning instructions substituted for the dataset's original task instruction.
replace_str= "Plan to Find the Right Slot:\r\n\r\n1. Analyze Constraints: Carefully review all constraints provided in the problem, including participant schedules, meeting duration, and any specific preferences or restrictions.\r\n\r\n2. Identify Individual Availability: For each participant, list all their free time slots during the specified work hours.\r\n\r\n3. Determine Common Availability: Find the overlapping free time slots (intersection of availability) across all participants.\r\n\r\n4. Prioritize Earliest Time Slot: Always choose the earliest time slot from the intersection that satisfies all constraints.\r\n\r\n5. Ensure Solution Exists: Since a solution is guaranteed for each task, always provide a valid time slot that works for everyone.\r\n\r\n6. Strict Format Requirement: Present the solution using the same format as provided in the examples. Avoid deviating from the established format.\r\n\r\nDO NOT only provide solution, also provide step-by-step plan execution.\r\n\r\nExecute the Plan Step-by-Step: [Provide executed steps]\r\n\r\nSOLUTION: [strictly use the format provided in the above examples]"

# Template used to ask the model for the instance-specific constraints.
verification_prompt_temp = """Analyze the input problem and identify all instance-specific constraints from the problem description. These may include:
- Participant schedules
- Meeting duration
- Specific preferences or restrictions
- Work hours and availability
- Any other relevant constraints based on the input problem

Input Problem:
{problem}
"""

# Template for the final call that reformats the generated plan.
solving_prompt = """Your task is to only format the generated plan into desired solution format given in few-shot examples.

Problem:
{question}

Generated Plan:
{plan}

Please provide your solution following the format shown in the examples above.
"""

# Maximum number of attempts per instance for the orchestrated run.
retries = 5

# tqdm infers `total` from len(data_calendar); the enumerate index was never used.
for data_point in tqdm(data_calendar, desc="Processing calendar with PlanGEN (Mixture of Algo.)"):

    # NOTE(review): str.replace substitutes every occurrence of the instruction
    # text; if the few-shot examples inside 'prompt_5shot' contain the same
    # sentence they are rewritten as well - confirm this is intended.
    problem_statement = data_point['prompt_5shot'].replace('Find a time that works for everyone\'s schedule and constraints. \nSOLUTION:', replace_str)

    # Constraint step: the model's answer becomes the verification prompt.
    verification = verification_prompt_temp.format(problem=data_point['prompt_0shot'])
    verification_prompt = call_gemini(verification, model_name=model_name)

    # A new orchestrator is created for every instance, as in the original cell.
    orchestrator = OrchastrationUnit()
    for attempt in range(retries):
        try:
            best_answer, selected_algorithm_lst, value_dict, feedback_lst, all_plans = orchestrator.execute(problem_statement, verification_prompt, model_name=model_name)
            break
        except Exception as e:
            print(f"Error during multi-agent run (attempt {attempt + 1}): {e}")
            # Fall back to an empty plan; it is overwritten if a later attempt succeeds.
            best_answer = ""
            if attempt < retries - 1:
                print("Retrying...")
            else:
                print("All retries failed.")

    # Solving phase
    solving_prompt_filled = solving_prompt.format(question=data_point["prompt_5shot"], plan=best_answer)
    prediction = call_gemini(solving_prompt_filled, model_name=model_name)

    # Store the outputs on a shallow copy so the shared `data_calendar` records
    # are not overwritten; results kept from another framework's run stay intact.
    result = dict(data_point)
    result['model_plan'] = best_answer
    result['model_prediction'] = prediction

    final_predictions.append(result)

### Prediction Loop for NATURAL PLAN (Trip)

> Run the prediction loop for trip planning task and save in `final_predictions` list

In [None]:
# Collects one result record per trip-planning instance for PlanGEN (Mixture of Algorithms).
final_predictions = []

# --- Loop-invariant prompt pieces (built once instead of once per instance) ---

# Planning constraints and output format appended after each few-shot prompt.
replace_str= """Please follow the below constraints while generating plan:\r\n- Strictly ensure the trip duration equals the specified total trip days.\r\n- First find the city order based on constraints given and follow the exact city order while generating plan\r\n- Follow the exact order and timing for events, workshops, or other fixed-date activities.\r\n- Use only direct flights between cities as provided in the flight connections list.\r\n- On the last day of your visit to each city, you can take a direct flight to the next city and arrive on the same day. Both the day you arrive and the day you depart count toward the total number of days spent in each city.\r\n- Confirm each city’s stay aligns with the constraints.\r\n\r\nProvide the itinerary strictly in the following format:\r\n\r\nDay 1-[End Day]: City Name and details of stay.\r\nDay [Start of Travel]: Fly from [City A] to [City B].\r\n\r\nSOLUTION: [strictly follow the same solution format as shown above]"""

# Fixed verification checklist shared by every trip instance.
verification_prompt = """1. Strictly ensure the trip duration equals the specified total trip days.
2. Verify the correct city order based on constraints
3. Verify the order and timing for events, workshops, or other fixed-date activities.
4. Verify that only direct flights between cities as provided in the flight connections list are used.
5. Verify each city’s stay aligns with the constraints.
"""

# Template for the final call that reformats the generated plan.
solving_prompt = """Your task is to only format the generated plan into desired solution format given in few-shot examples.

Problem:
{question}

Generated Plan:
{plan}

Please provide your solution following the format shown in the examples above.
"""

# Maximum number of attempts per instance for the orchestrated run.
retries = 5

# tqdm infers `total` from len(data_trip); the enumerate index was never used.
for data_point in tqdm(data_trip, desc="Processing trip with PlanGEN (Mixture of Algo.)"):

    problem_statement = data_point['prompt_5shot'] + '\n' + replace_str

    # A new orchestrator is created for every instance, as in the original cell.
    orchestrator = OrchastrationUnit()
    for attempt in range(retries):
        try:
            best_answer, selected_algorithm_lst, value_dict, feedback_lst, all_plans = orchestrator.execute(problem_statement, verification_prompt, model_name=model_name)
            break
        except Exception as e:
            print(f"Error during multi-agent run (attempt {attempt + 1}): {e}")
            # Fall back to an empty plan; it is overwritten if a later attempt succeeds.
            best_answer = ""
            if attempt < retries - 1:
                print("Retrying...")
            else:
                print("All retries failed.")

    # Solving phase
    solving_prompt_filled = solving_prompt.format(question=data_point["prompt_5shot"], plan=best_answer)
    prediction = call_gemini(solving_prompt_filled, model_name=model_name)

    # Store the outputs on a shallow copy so the shared `data_trip` records
    # are not overwritten; results kept from another framework's run stay intact.
    result = dict(data_point)
    result['model_plan'] = best_answer
    result['model_prediction'] = prediction

    final_predictions.append(result)

# Analyze the Results

> Here, we provide code to generate a table with the problem statement, golden plan, and model prediction for each instance, so that the correctness of the predictions can be checked.

In [None]:
import pandas as pd
from IPython.display import display, HTML

def create_analysis_table(final_predictions):
    """
    Generates a Pandas DataFrame for analyzing model predictions with text wrapping.

    Args:
        final_predictions: A list of dictionaries, where each dictionary represents
                           a data point with 'prompt_5shot', 'golden_plan', and
                           'model_prediction' keys.

    Returns:
        A Pandas DataFrame containing the analysis table.
    """
    analysis_data = []
    for data_point in final_predictions:
        analysis_data.append({
            'Problem Statement': data_point['prompt_0shot'],
            'Golden Plan': data_point.get('golden_plan', 'N/A'), # Handle missing golden_plan gracefully
            'Model Prediction': data_point['model_prediction']
        })

    df = pd.DataFrame(analysis_data)

    # Apply text wrapping to all columns
    return df.style.set_properties(**{'text-align': 'left'}).set_table_styles([dict(selector='th', props=[('text-align', 'left')])])


# Example usage: build and render the table for the current `final_predictions`.
# Every prediction-loop cell above resets `final_predictions`, so this shows the
# results of whichever loop was executed last.
analysis_table = create_analysis_table(final_predictions)
display(analysis_table)
