# Simple MCTS
This notebook investigates using MCTS for a single static website where multiple actions have to be done to achieve an answer.

The task is to order a macbook with certain configurations. This website was chosen over others because it does not change upon selecting certain elements, which substantially simplifies testing as a website can simply be cached.

The start domain is [this page](https://www.apple.com/shop/buy-mac/macbook-pro/14-inch-space-gray-apple-m3-chip-with-8-core-cpu-and-10-core-gpu-8gb-memory-512gb) with the prompt "order a macbook pro 14 with 24 gb, 2 tb, fast charging and all available software"

Command to run this:
```
python run_demo.py --task_name openended --model_name openai/gpt-4o-mini --start_url https://www.apple.com/shop/buy-mac/macbook-pro/14-inch-space-gray-apple-m3-chip-with-8-core-cpu-and-10-core-gpu-8gb-memory-512gb
```

## Setup and load cached website

In [477]:
website = "https://www.apple.com/shop/buy-mac/macbook-pro/14-inch-space-gray-apple-m3-chip-with-8-core-cpu-and-10-core-gpu-8gb-memory-512gb"

human_prompt = "order a macbook pro 14 with 24 gb ram, 2 tb, fast charging and all available software"

In [79]:
# ideal actions are:
# ideal_actions = [
#     "click(1007)",  # Select 24GB unified memory
#     "click(1038)",  # Select 2TB SSD storage
#     "click(1061)",  # Select 96W USB-C Power Adapter
#     "click(1112)",  # Select Final Cut Pro software
#     "click(1135)",  # Select Logic Pro software
#     "click(1209)",  # Add to bag
# ]
ideal_actions = [
    'click(1008)',
    'click(1039)',
    'click(1062)',
    'click(1113)',
    'click(1136)',
    'click(1209)'
    ]

In [80]:
txt_file = "../output_example_2.txt"
with open(txt_file, 'r') as file:
    lines = file.readlines()

system_messages = []
prompts = []
actions = []

current_section = None

for line in lines:
    if line.startswith("System Message:"):
        current_section = "System Message"
    elif line.startswith("Prompt:"):
        current_section = "Prompt"
    elif line.startswith("Action:"):
        current_section = "Action"
    else:
        if current_section == "System Message":
            system_messages.append(line)
        elif current_section == "Prompt":
            prompts.append(line)
        elif current_section == "Action":
            actions.append(line)

system_prompt = system_messages[0].split("content='")[-1].strip()
base_prompt = prompts[0].split("content=\'")[-1].strip()
# ideal_actions = actions[:6]

In [81]:
import sys
sys.path.append("../demo_agent")
from agents.legacy.dynamic_prompting import Think, Memory, ActionSpace, Flags

flags=Flags(
    use_html=True,
    use_ax_tree=True,
    use_thinking=True,  # "Enable the agent with a memory (scratchpad)."
    use_error_logs=True,  # "Prompt the agent with the error logs."
    use_memory=False,  # "Enables the agent with a memory (scratchpad)."
    use_history=True,
    use_diff=False,  # "Prompt the agent with the difference between the current and past observation."
    use_past_error_logs=True,  # "Prompt the agent with the past error logs."
    use_action_history=True,  # "Prompt the agent with the action history."
    multi_actions=True,
    action_space="bid",
    use_abstract_example=True,  # "Prompt the agent with an abstract example."
    use_concrete_example=True,  # "Prompt the agent with a concrete example."
    use_screenshot=False,
    enable_chat=True,
    demo_mode="default",
)

think = Think(visible=lambda: flags.use_thinking)
memory = Memory(visible=lambda: flags.use_memory)
action_space = ActionSpace(flags)

def parser(text_answer):
    ans_dict = {}
    try:
        ans_dict.update(think._parse_answer(text_answer))
        ans_dict.update(memory._parse_answer(text_answer))
        ans_dict.update(action_space._parse_answer(text_answer))
    except Exception as e:
        ans_dict['action'] = None
        ans_dict['think'] = None

    return ans_dict, True, ""

In [177]:
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage


with open("../openai_key.txt", "r") as file:
    api_key = file.read().strip()

model = ChatOpenAI(
            model_name="gpt-4o-mini",
            temperature=0.01,
            max_tokens=2_000,
            api_key=api_key
        ).bind(logprobs=True)

## Preliminaries: clean html and build prompts

In [83]:
import re
from html.parser import HTMLParser

class HTMLCleaner(HTMLParser):
    def __init__(self):
        super().__init__()
        self.output = []
        self.tag_stack = []
        self.current_content = []
        self.interactive_elements = {
            'a', 'button', 'input', 'select', 'textarea', 'label', 'fieldset',
            'legend', 'datalist', 'output', 'option', 'optgroup'
        }

    def is_interactive(self, tag, attrs):
        if tag.lower() in self.interactive_elements:
            return True
        return any(attr[0] == 'onclick' for attr in attrs)

    def handle_starttag(self, tag, attrs):
        if tag.lower() == 'img':
            return
        
        is_interactive = self.is_interactive(tag, attrs)
        bid_attr = next((attr for attr in attrs if attr[0] == 'bid'), None)
        
        if bid_attr and not is_interactive:
            bid_attr = None

        self.tag_stack.append((tag, bid_attr, len(self.output)))
        self.current_content.append([])

    def handle_endtag(self, tag):
        if tag.lower() == 'img':
            return

        if self.tag_stack and self.tag_stack[-1][0] == tag:
            start_tag, bid_attr, start_index = self.tag_stack.pop()
            content = ''.join(self.current_content.pop()).strip()

            if content:
                if bid_attr:
                    self.output.insert(start_index, f'<{start_tag} bid="{bid_attr[1]}">')
                else:
                    self.output.insert(start_index, f'<{start_tag}>')
                self.output.append(content)
                self.output.append(f'</{tag}>')

            if self.current_content:
                self.current_content[-1].extend(self.output[start_index:])
                del self.output[start_index:]

    def handle_data(self, data):
        normalized_data = re.sub(r'(\\n|\n|\r)+', '', data)
        normalized_data = re.sub(r'\s+', ' ', normalized_data)
        if self.current_content:
            self.current_content[-1].append(normalized_data)
        else:
            self.output.append(normalized_data)

def clean_html(html_content):
    html_content = html_content.replace('\\n', '\n')
    cleaner = HTMLCleaner()
    cleaner.feed(html_content)
    return ''.join(cleaner.output).strip()

In [84]:
html = base_prompt.split("# ")[4]
c_html = clean_html(html)
len(html), len(c_html)

(288727, 32333)

In [85]:
all([ia[6:10] in c_html for ia in ideal_actions])

True

In [202]:
concrete_example = """
Concrete Example:\\n\\nHere is a concrete example of how to think about next options.
Make sure you follow this structure, but replace the content with your answer.
\\n\\n<think>\\n
Currently I see xxx on the page. The possible next steps to achieve the goal are: 
1) click on the extra memory button 
2) expand the list with bid 2341 
3) fill in field yyy with zzz 
4) click on the field that completes the order 
5) select the option u 
6) click no the home button 
7) return to the latest page 
8) select dropdown to search for option yyy 
9) click button n for xx
10) finish the task by clicking on the final button.\\n 
Now I will predict the corresponding actions for the above goals.
\\n</think>\\n\\n<action>\\
click(648)
click(2341)
fill(yyy, zzz)
click(123)
click(456)
click(789)
click(1011)
click(1213)
click(1415)
click(3214)
\\n</action>\\n\\n
"""

In [203]:
simple_action_space = """Action space:\\n\\1 type of actions are available.\\n\\nclick(bid: int)\\n    Description: Click an element.\\n    Examples:\\n        click(\\\'151\\\')\\n\\n    Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.\\nExample:\\nfill(\\\'a12\\\', \\\'example with "quotes"\\\')\\nclick(\\\'a51\\\')\\nclick(\\\'48\\\', button=\\\'middle\\\', modifiers=[\\\'Shift\\\'])\\n\\n"""

def build_action_prompt(base_prompt, actions, action_thoughts, thoughts):
    base_splits = base_prompt.split("# ")
    # change html obs
    html = base_splits[4][7:]
    base_splits[4] = "HTML:" + clean_html(html)

    hist_split = base_splits[6]
    hist_instruction = hist_split[:-4]
    hist_end = hist_split[-4:]
    act_thought_list = [a + " #" + t for a, t in zip(actions, action_thoughts)]
    new_hist = hist_instruction + " Actions: [" + ", ".join(act_thought_list) + "]; Thoughts [" + ", ".join(thoughts) + "]" + hist_end
    base_splits[6] = new_hist
    base_splits[7] = simple_action_space
    base_splits[8] = ""
    base_splits[9] = concrete_example
    new_prompt = "# ".join(base_splits)
    new_prompt += " # Final Instruction: Given that the last actions are: " + ", ".join(actions) + ", Are possible next actions? Do not pick an action that you already tried."
    return new_prompt

In [182]:
base_prompt.split("# ")[9]

'Concrete Example\\n\\nHere is a concrete example of how to format your answer.\\nMake sure to follow the template with proper tags:\\n\\n<think>\\nMy memory says that I filled the first name and last name, but I can\\\'t see any\\ncontent in the form. I need to explore different ways to fill the form. Perhaps\\nthe form is not visible yet or some fields are disabled. I need to replan.\\n</think>\\n\\n<action>\\nfill(\\\'a12\\\', \\\'example with "quotes"\\\')\\nclick(\\\'a51\\\')\\nclick(\\\'48\\\', button=\\\'middle\\\', modifiers=[\\\'Shift\\\'])\\n</action>\\n\''

In [161]:
# ground truth
actions = []
action_thoughts = []
thoughts = []

new_prompt = build_action_prompt(base_prompt, actions, action_thoughts, thoughts)
chat_messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=new_prompt+"Think about what to do and then predict all actions at once to complete the task."),
]
out = model.invoke(chat_messages)
ans_dict = parser(out.content)

In [162]:
print(f"Predicted:\n{ans_dict[0]['action']}")
print(f"\nIdeal:\n{ideal_actions}")

Predicted:
click('1008')  # Select 24GB unified memory
click('1039')  # Select 2TB SSD storage
click('1113')  # Select Final Cut Pro
click('1136')  # Select Logic Pro
click('1209')  # Add to Bag

Ideal:
['click(1008)', 'click(1039)', 'click(1062)', 'click(1113)', 'click(1136)', 'click(1209)']


In [222]:
import math
import copy

def get_relevant_tokens(tokens_data, target_string):
    relevant_tokens = []
    remaining = copy.deepcopy(target_string)
    started = False
    
    for token_data in tokens_data:
        if remaining.startswith(token_data['token']):
            started = True
            relevant_tokens.append(token_data)
            remaining = remaining[len(token_data['token']):]
            if not remaining:  # We've found all parts of the target string
                break
        else:
            # case if only a part but not the whole is found
            if started:
                remaining = copy.deepcopy(target_string)
                relevant_tokens = []
                started = False
    
    if remaining:
        print(f"Failed to find all parts of the target string. Remaining: {remaining}")

    return relevant_tokens

def get_string_probs(tokens_data, target_string):
    relevant_tokens = get_relevant_tokens(tokens_data, target_string)
    combined_logprob = sum(token['logprob'] for token in relevant_tokens)
    combined_probability = math.exp(combined_logprob)
    return combined_logprob, combined_probability

tokens_data = out.response_metadata['logprobs']['content']
target_string = "click('1008')"

relevant_tokens = get_relevant_tokens(tokens_data, target_string)

print("Relevant tokens:")
for token in relevant_tokens:
    print(f"Token: {token['token']}, LogProb: {token['logprob']}")

combined_logprob, combined_probability = get_string_probs(tokens_data, target_string)

print(f"\nCombined logprob: {combined_logprob}")
print(f"Combined probability: {combined_probability}")

Failed to find all parts of the target string. Remaining: click('1008')
Relevant tokens:
Failed to find all parts of the target string. Remaining: click('1008')

Combined logprob: 0
Combined probability: 1.0


In [204]:
# TODO generate several different actions that could be taken, ensure that those are at least 7
# ground truth
actions = []
action_thoughts = []
thoughts = []

new_prompt = build_action_prompt(base_prompt, actions, action_thoughts, thoughts)
chat_messages = [
    SystemMessage(content=system_prompt+" Only predict the next step and not the whole plan at once. Give the 10 most likely options for the next step."),
    HumanMessage(content=new_prompt),
]
out = model.invoke(chat_messages)
ans_dict = parser(out.content)

In [219]:
actions[0][6:10]

'1008'

In [226]:
tokens_data = out.response_metadata['logprobs']['content']
# target_string = "click(1008)\n"
actions = ans_dict[0]['action'].split("\n")
for act_string in actions:
    # target_string = act_string[6:10]
    target_string = act_string + "\n"
    combined_logprob, combined_probability = get_string_probs(tokens_data, target_string)
    print(f"Action: {act_string} Combined prob: {combined_probability}")

Action: click(1008) Combined prob: 0.6754329689296937
Action: click(1039) Combined prob: 0.9999404599163011
Action: click(1062) Combined prob: 0.9999037032548292
Action: click(1113) Combined prob: 0.9920437300749845
Action: click(1136) Combined prob: 0.9990625427459154
Action: click(1209) Combined prob: 0.9998538150977743
Action: click(1168) Combined prob: 0.9912863544603184
Action: click(1177) Combined prob: 0.9999844444153894
Action: click(1150) Combined prob: 0.9999767707355536
Action: click(945) Combined prob: 0.9998361345782552


Log-probs are kinda arbitrary, it seems like the first is quite low whereas the rest is pretty high. It does not really represent the probability of taking the action but rather the probability that this is generated next which seems to be pretty different as click(1008) would be the logically best option.

In [72]:
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field

class Action(BaseModel):
    think: str = Field(description="The goal of the next step to eventually accomplish the task")
    action: str = Field(description="The single action to accomplish the task. Should be click(<int>)")

class Plan(BaseModel):
    think: str = Field(description="The overall goal and thoughts to accomplish the task")
    plan: List[Action] = Field(description="Possible actions to take next to get closer to the goal. Should have length of at least 7")

structured_llm = model.with_structured_output(Plan, include_raw=True)
greedy_llm = model.with_structured_output(Action, include_raw=True)

In [10]:
def expand_predict(action_prompt):
    chat_messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=action_prompt),
    ]
    answer = structured_llm.invoke(chat_messages)
    return answer

def greedy_predict(action_prompt):
    chat_messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=action_prompt),
    ]
    answer = greedy_llm.invoke(chat_messages)
    return answer

In [468]:
actions = []
action_thoughts = []
thoughts = []

new_prompt = build_action_prompt(base_prompt, actions, action_thoughts, thoughts)

expand_answer = expand_predict(new_prompt)
greedy_answer = greedy_predict(new_prompt)

print([p.action for p in expand_answer.plan])
print(greedy_answer.action)

## Build MCTS

In [11]:
import numpy as np

In [12]:
class Node:
    def __init__(self, actions, action_thoughts, thoughts, parent=None, depth=0):
        self.actions = actions
        self.action_thoughts = action_thoughts
        self.thoughts = thoughts
        self.parent = parent
        self.children = []
        self.visits = 0  # add 1 to avoid division by 0 and thus never have inf value
        self.value = np.inf
        self.depth = depth

In [13]:
# define tree helper functions

def print_tree(node, indent=""):
    if len(node.actions) > 0:
        # print(f"{indent}Thoughts: {node.thoughts[-1]}")
        # print(f"{indent}Action Thoughts: {node.action_thoughts[-1]}")
        print(f"{indent}Action: {node.actions[-1]}")
        print(f"{indent}Value: {node.value}")
    print(f"{indent}Children: {len(node.children)}")
    for child in node.children:
        print_tree(child, indent + "  ")

# create all possible trajectories of actions from tree
def get_trajectories(node):
    if not node.children:
        return [[node.actions]]
    
    trajectories = []
    for child in node.children:
        child_trajectories = get_trajectories(child)
        for trajectory in child_trajectories:
            trajectories.append([node.actions] + trajectory)
    
    trajectories = [t[-1] for t in trajectories]
    return trajectories

In [68]:
import copy
import math
import random

def select(node: Node, alpha: float = 1.0) -> Node:
    while node.children:
        if len(node.children) < 1:
            return node
        node = ucb_select(node, alpha)
    return node

def greedy_select(node: Node) -> Node:
    all_zero = False
    while node.children:
        if len(node.children) < 1:
            return node
        node, all_zero = best_child(node)
    return node, all_zero

def ucb_select(node: Node, alpha: float = 1.0) -> Node:
    scores = [ucb_score(c, node.visits, alpha) for c in node.children]
    max_score = max(scores)
    max_children = [c for c, s in zip(node.children, scores) if s == max_score]
    return random.choice(max_children)

def ucb_score(node: Node, total_visits: int, alpha: float = 1.0) -> float:
    if node.visits == 0:
        return float('inf')
    return (node.value / node.visits) + alpha * math.sqrt(2 * math.log(total_visits) / node.visits)

def expand(node: Node) -> Node:
    # this should be taking an action and getting to a new state
    answer = expand_predict(build_action_prompt(base_prompt, node.actions, node.action_thoughts, node.thoughts))
    parsed_answer = answer['parsed']
    new_thought = parsed_answer.think
    for action in parsed_answer.plan:
        new_actions = node.actions + [action.action]
        new_action_thoughts = node.action_thoughts + [action.think]
        new_thoughts = node.thoughts + [new_thought]
        child = Node(copy.deepcopy(new_actions),
                     copy.deepcopy(new_action_thoughts),
                     copy.deepcopy(new_thoughts),
                     parent=node,
                     depth=node.depth+1)
        node.children.append(child)

# def expand_existing(node: None) -> Node:


def backpropagate(node: Node, reward: float):
    while node:
        node.visits += 1
        if node.value == np.inf:
            node.value = copy.deepcopy(reward)
        else:
            node.value += copy.deepcopy(reward)
        node = node.parent

def verify_success(actions):
    if actions == ideal_actions:
        return True
    return False

def best_child(node: Node) -> Node:
    all_zero = False
    if all(c.value == 0. for c in node.children):
        all_zero = True

    node_vals = [c.value/c.visits if c.visits > 0 else np.inf for c in node.children]
    max_v = max(node_vals)
    candidates = [c for v, c in zip(node_vals, node.children) if v == max_v]
    return max(candidates, key=lambda c: c.visits), all_zero

In [51]:
def get_t_value(trajectory):
    # correct_actions = 0.
    # for a, b in zip(trajectory, ideal_actions):
    #     if a == b:
    #         correct_actions += 1
    #     else:
    #         break

    # score = correct_actions / len(ideal_actions)
    score = 1
    for a, b in zip(trajectory, ideal_actions):
        if not a == b:
            score = 0
    return score

t = ['click(1008)', 'click(1038)', 'click(1062)', 'click(1112)\nclick(1136)']
t2 = ['click(1008)', 'click(1039)']
get_t_value(ideal_actions), get_t_value(t), get_t_value(t2)

(1, 0, 1)

In [20]:
# greedy baseline
max_depth = 6

actions = []
action_thoughts = []
thoughts = []

root = Node(actions, action_thoughts, thoughts)

for _ in range(max_depth):
    out = greedy_predict(build_action_prompt(base_prompt, root.actions, root.action_thoughts, root.thoughts))
    answer = out['parsed']
    root.action_thoughts += [answer.think]
    root.actions += [answer.action]
print(root.actions, get_t_value(root.actions))

["click('1008')", "click('1039')", "click('1039')", "click('1049')", "click('1049')", "click('1100')"] 0.0


In [69]:
from tqdm import tqdm

# get initial options
max_depth = 6
max_iters = 100
alpha = 0.7

actions = []
action_thoughts = []
thoughts = []

root = Node(actions, action_thoughts, thoughts)

In [71]:
success_depth = 0
for iter_idx in tqdm(range(max_iters)):
    selected_node, all_zero = greedy_select(root)

    # expansion + simulation
    remaining_depth = max_depth - selected_node.depth

    value = get_t_value(selected_node.actions)
    print(selected_node.actions, value, all_zero)
    backpropagate(selected_node, value)

    if remaining_depth > 0 and value != 0.0:
        for _ in range(remaining_depth):
            expand(selected_node)
            selected_node, all_zero = greedy_select(selected_node)
            value = get_t_value(selected_node.actions)
            backpropagate(selected_node, value)
            print(selected_node.actions, value, all_zero)

            if value == 0.0:
                break

  0%|          | 0/100 [00:00<?, ?it/s]

[] 1 False
['click(1008)'] 1 False
['click(1008)', 'click(1039)'] 1 False
['click(1008)', 'click(1039)', 'click(1062)'] 1 False
['click(1008)', 'click(1039)', 'click(1062)', 'click(1113)'] 1 False


100%|██████████| 100/100 [00:28<00:00,  3.55it/s]

['click(1008)', 'click(1039)', 'click(1062)', 'click(1113)', 'click(1209)'] 0 False
['click(1039)'] 0 False
['click(1049)'] 0 False
['click(1062)'] 0 False
['click(1100)'] 0 False
['click(1136)'] 0 False
['click(1209)'] 0 False
['click(1008)', 'click(1049)'] 0 False
['click(1008)', 'click(1039)', 'click(1062)', 'click(1136)'] 0 False
['click(1008)', 'click(1039)', 'click(1062)', 'click(1100)'] 0 False
['click(1008)', 'click(1039)', 'click(1062)', 'click(1209)'] 0 False
['click(1008)', 'click(1039)', 'click(1062)', 'click(1209)'] 0 False
['click(1008)', 'click(1039)', 'click(1062)', 'click(1150)'] 0 False
['click(1008)', 'click(1039)', 'click(1062)', 'click(1209)'] 0 False
['click(1008)', 'click(1039)', 'click(1062)', 'click(1113)', 'click(1209)'] 0 True
['click(1008)', 'click(1039)', 'click(1062)', 'click(1113)', 'click(1209)'] 0 True
['click(1008)', 'click(1039)', 'click(1062)', 'click(1113)', 'click(1209)'] 0 True
['click(1008)', 'click(1039)', 'click(1062)', 'click(1113)', 'click(12




In [61]:
# get initial options
max_depth = 6
max_iters = 100
alpha = 0.7

actions = []
action_thoughts = []
thoughts = []

root = Node(actions, action_thoughts, thoughts)

success_depth = 0
for iter_idx in tqdm(range(max_iters)):
    selected_node = select(root, alpha)

    # expansion + simulation
    remaining_depth = max_depth - selected_node.depth

    value = get_t_value(selected_node.actions)
    print(selected_node.actions, value)
    backpropagate(selected_node, value)

    if remaining_depth > 0 and value != 0.0:
        for _ in range(remaining_depth):
            expand(selected_node)
            selected_node = select(selected_node, alpha)
            value = get_t_value(selected_node.actions)
            backpropagate(selected_node, value)
            print(selected_node.actions, value)

            if value == 0.0:
                break

  0%|          | 0/100 [00:00<?, ?it/s]

[] 1


  1%|          | 1/100 [00:03<05:59,  3.63s/it]

['click(1100)'] 0
['click(1039)'] 0
['click(1209)'] 0
['click(1008)'] 1


  4%|▍         | 4/100 [00:09<03:42,  2.32s/it]

['click(1008)', 'click(1123)'] 0
['click(1136)'] 0
['click(1113)'] 0
['click(1062)'] 0
['click(1008)', 'click(1039)'] 1


100%|██████████| 100/100 [00:14<00:00,  6.70it/s]

['click(1008)', 'click(1039)', 'click(1209)'] 0
['click(1039)'] 0
['click(1062)'] 0
['click(1209)'] 0
['click(1113)'] 0
['click(1100)'] 0
['click(1136)'] 0
['click(1008)', 'click(1209)'] 0
['click(1209)'] 0
['click(1100)'] 0
['click(1113)'] 0
['click(1062)'] 0
['click(1136)'] 0
['click(1039)'] 0
['click(1008)', 'click(1209)'] 0
['click(1008)', 'click(1168)'] 0
['click(1136)'] 0
['click(1113)'] 0
['click(1062)'] 0
['click(1100)'] 0
['click(1209)'] 0
['click(1039)'] 0
['click(1008)', 'click(1100)'] 0
['click(1039)'] 0
['click(1113)'] 0
['click(1100)'] 0
['click(1062)'] 0
['click(1136)'] 0
['click(1209)'] 0
['click(1008)', 'click(1204)'] 0
['click(1008)', 'click(1039)', 'click(1100)'] 0
['click(1039)'] 0
['click(1100)'] 0
['click(1113)'] 0
['click(1062)'] 0
['click(1209)'] 0
['click(1136)'] 0
['click(1008)', 'click(1209)'] 0
['click(1039)'] 0
['click(1136)'] 0
['click(1062)'] 0
['click(1100)'] 0
['click(1209)'] 0
['click(1113)'] 0
['click(1008)', 'click(1123)'] 0
['click(1136)'] 0
['click




In [39]:
ideal_actions

['click(1008)',
 'click(1039)',
 'click(1062)',
 'click(1113)',
 'click(1136)',
 'click(1209)']

In [74]:
best_c = copy.deepcopy(root)
while best_c.children:
    print([c.actions[-1] for c in best_c.children])
    print([c.value for c in best_c.children])
    print([c.visits for c in best_c.children])
    best_c, _ = best_child(best_c)
print(f"\nBest trajectory:{best_c.actions}\nScore: {get_t_value(best_c.actions)}")

['click(1008)', 'click(1039)', 'click(1049)', 'click(1062)', 'click(1100)', 'click(1136)', 'click(1209)']
[4, 0, 0, 0, 0, 0, 0]
[98, 1, 1, 1, 1, 1, 1]
['click(1039)', 'click(1049)']
[3, 0]
[96, 1]
['click(1062)']
[2]
[95]
['click(1113)', 'click(1136)', 'click(1100)', 'click(1209)', 'click(1209)', 'click(1150)', 'click(1209)']
[1, 0, 0, 0, 0, 0, 0]
[88, 1, 1, 1, 1, 1, 1]
['click(1209)']
[0]
[87]

Best trajectory:['click(1008)', 'click(1039)', 'click(1062)', 'click(1113)', 'click(1209)']
Score: 0


Greedy predict seems to be more reasonable than UCB as UCB needs quite some iterations to find something usefull

In [None]:
# TODO make this work with Thopson Sampling and use logprobs as prior

In [None]:
# find way to reliably generate n possible actions

# get logprobs and parse

In [78]:
from openai import OpenAI
import os
import json
from typing import List, Tuple

# Ensure you've set your OpenAI API key in your environment variables
client = OpenAI(api_key=api_key)

def generate_responses_with_logprobs(prompt: str, n: int = 7) -> List[Tuple[str, float]]:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Generate {n} distinct and varied responses to the given prompt. Ensure that each response is unique and doesn't repeat ideas from the others. Format your response as a JSON array of objects, where each object has 'response' and 'logprob' keys. Set the 'logprob' to 0 for now."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        n=1,  # We only need one completion as we're asking for multiple responses in JSON format
    )

    # Parse the JSON response
    try:
        responses = json.loads(response.choices[0].message.content)
        return [(item['response'], item['logprob']) for item in responses]
    except json.JSONDecodeError:
        print("Error: Could not parse JSON response. Raw response:")
        print(response.choices[0].message.content)
        return []

def ensure_distinct_responses(responses: List[Tuple[str, float]], n: int = 7) -> List[Tuple[str, float]]:
    distinct_responses = []
    seen = set()

    for response, logprob in responses:
        if response not in seen:
            distinct_responses.append((response, logprob))
            seen.add(response)

    while len(distinct_responses) < n:
        additional_responses = generate_responses_with_logprobs(prompt, n=(n - len(distinct_responses)))
        for response, logprob in additional_responses:
            if response not in seen:
                distinct_responses.append((response, logprob))
                seen.add(response)
                if len(distinct_responses) == n:
                    break

    return distinct_responses[:n]

# Example usage
prompt = "What are some creative ways to reduce plastic waste?"
responses_with_logprobs = ensure_distinct_responses(generate_responses_with_logprobs(prompt))

for i, (response, logprob) in enumerate(responses_with_logprobs, 1):
    print(f"{i}. Response: {response}")
    print(f"   Log Probability: {logprob}")
    print()

Error: Could not parse JSON response. Raw response:
```json
[
    {
        "response": "Implement a community-based swap program where people can exchange items like clothing, household goods, and toys instead of buying new plastic products.",
        "logprob": 0
    },
    {
        "response": "Encourage local businesses to adopt a 'bring your own container' policy, providing discounts to customers who bring their own reusable bags, mugs, or containers.",
        "logprob": 0
    },
    {
        "response": "Create art installations using discarded plastic items to raise awareness about plastic pollution and inspire community engagement in recycling efforts.",
        "logprob": 0
    },
    {
        "response": "Develop a subscription service that delivers bulk household products in reusable containers, helping to eliminate single-use plastic packaging from everyday purchases.",
        "logprob": 0
    },
    {
        "response": "Host workshops and classes teaching people how

KeyboardInterrupt: 