# Running the benchmarker over our stories

### Initial setup and imports

In [2]:
import os
import numpy as np
import nest_asyncio
import asyncio
from pathlib import Path
import json
from dotenv import load_dotenv

# Resolve the repository root, i.e. the main web-agent folder that contains `src`.
# The notebook may be started from the repository root itself or from a folder
# directly below it (the default git layout), so look for `src` in the current
# working directory and fall back to its parent otherwise.
# Anchoring on `src` instead of always taking the parent of the working
# directory keeps this cell idempotent: re-running it after the chdir below
# yields the same project_root instead of climbing one more level.
if "src" in os.listdir():
    project_root = os.getcwd()
else:
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# This notebook is set up to run in the main web-agent folder, so move there
# (a no-op when we are already in it).
os.chdir(project_root)

load_dotenv()
nest_asyncio.apply() # to allow asynchronous calls in a jupyter notebook

### Now imports that are more specific to our project

In [3]:
from src.agents import WebAgent
from langchain_groq import ChatGroq
from src.omniparser import OmniParserConfig, OmniParser
from tests.benchmark_agent_node import select_examples, compare_agent_node

In [4]:
# Both model paths handed to OmniParserConfig live under src/weights/omniparser.
omniparser_weights_dir = os.path.join(project_root, "src", "weights", "omniparser")

config = OmniParserConfig(
    som_model_path=os.path.join(omniparser_weights_dir, "icon_detect", "best.pt"),
    device="cuda",
    caption_model_path=os.path.join(omniparser_weights_dir, "icon_caption_blip2"),
)

# Note carried over from the original cell: to use llava instead, set
# caption_model="llava" on the OmniParserConfig.
omniparser = OmniParser.from_config(config)

# Note carried over from the original cell: an llm has to be set here if the
# default (llama) is not wanted.
web_agent = WebAgent(
    project_root=project_root,
    image_parser=omniparser,
    log_screenshots=False,
    tag_with_js=False,
)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


KeyboardInterrupt: 

In [4]:

# Grab some examples: every entry in the Examples folder whose name starts
# with "story_", visited in sorted order.
usr_stories = [file_name for file_name in os.listdir(Path(project_root) / "Examples")
               if file_name.startswith("story_")]
usr_stories.sort()
examples = []
for story in usr_stories:
    examples.extend(select_examples(file_path=Path(project_root) / "Examples" / story))

# Initialize the list that captures the result of each example
all_results = []

# Test them
SAVE_EVERY = 5  # write a checkpoint to disk every time this many new results have accumulated
file_name = "benchmark_results.json"
results_path = Path(project_root) / file_name

for i, example in enumerate(examples):
    try:
        results = asyncio.run(compare_agent_node(example, web_agent, verbose=False))
    except Exception as exc:
        # Best effort: one failing example must not abort the whole benchmark,
        # but report it so missing rows in the results can be explained.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so the run can still be stopped by hand.
        print(f"Example {i} failed and was skipped: {exc!r}")
        continue
    all_results.append(results)
    # Checkpoint on the number of collected results rather than the loop index,
    # so a failing example never causes a checkpoint to be missed.
    if len(all_results) % SAVE_EVERY == 0:
        with open(results_path, "w") as f:
            json.dump(all_results, f, indent=4)

# Final save so that results gathered after the last checkpoint are persisted too
with open(results_path, "w") as f:
    json.dump(all_results, f, indent=4)

print(f"Benchmarked {len(all_results)} of {len(examples)} examples "
      f"({len(examples) - len(all_results)} skipped)")

# For reference, each `results` entry is expected to have this shape
# (NOTE(review): copied from the benchmark helper - confirm it is still current):
#  results = {
#         "action_matched": action_matched,
#         "correct_action": correct_action,
#         "predicted_action": end_state['prediction']['action'],
#         "normalized_error": normalized_error,
#         "time_taken": end_time - start_time,
#         "target_window_matched": target_matched,
#         "direction_matched": direction_matched,
#     }

INFO: Found 7 examples
INFO: Found 2 examples
INFO: Found 11 examples
INFO: Found 5 examples


INFO: Found 9 examples
INFO: Found 4 examples
INFO: Found 1 examples
INFO: Found 4 examples
INFO: Found 3 examples

0: 736x1280 2 icons, 56.6ms
Speed: 4.6ms preprocess, 56.6ms inference, 466.7ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:47:46,141: INFO: logger: Step 1 | Action: Click | Action Args: [16]]



0: 736x1280 6 icons, 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 1.4ms postprocess per image at shape (1, 3, 736, 1280)


Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.
[2024-12-19 23:47:50,157: INFO: logger: Step 2 | Action: Google | Action Args: None]



0: 736x1280 (no detections), 6.9ms
Speed: 3.1ms preprocess, 6.9ms inference, 0.5ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 (no detections), 6.9ms
Speed: 3.0ms preprocess, 6.9ms inference, 0.5ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 (no detections), 6.9ms
Speed: 2.9ms preprocess, 6.9ms inference, 0.5ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 11 icons, 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:48:02,432: INFO: logger: Step 4 | Action: None | Action Args: None]



0: 736x1280 12 icons, 6.6ms
Speed: 3.2ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:48:05,927: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-19 23:48:11,640: INFO: logger: Step 5 | Action: Click | Action Args: [0]]



0: 736x1280 11 icons, 6.4ms
Speed: 3.0ms preprocess, 6.4ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:48:13,676: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-19 23:48:24,210: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:48:32,206: INFO: logger: Step 6 | Action: Type | Action Args: [39, '85000']]



0: 736x1280 19 icons, 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:48:34,112: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 11.000000 seconds]
[2024-12-19 23:48:46,402: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:48:53,942: INFO: logger: Step 7 | Action: Click | Action Args: [75]]



0: 736x1280 24 icons, 6.6ms
Speed: 3.1ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:48:57,884: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-19 23:49:07,935: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-19 23:49:13,566: INFO: logger: Step 1 | Action: Type | Action Args: [13, 'Mortgage rates California']]



0: 736x1280 15 icons, 6.8ms
Speed: 3.3ms preprocess, 6.8ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:49:17,330: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:49:27,025: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:49:33,744: INFO: logger: Step 2 | Action: Click | Action Args: [42]]



0: 736x1280 7 icons, 6.6ms
Speed: 3.2ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:49:35,346: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:49:44,589: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:49:51,377: INFO: logger: Step 1 | Action: Click | Action Args: [20]]



0: 736x1280 8 icons, 6.9ms
Speed: 3.3ms preprocess, 6.9ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:49:53,290: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:50:03,205: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-19 23:50:08,862: INFO: logger: Step 2 | Action: Click | Action Args: [20]]



0: 736x1280 13 icons, 6.7ms
Speed: 3.3ms preprocess, 6.7ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:50:10,266: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:50:20,174: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:50:28,731: INFO: logger: Step 3 | Action: Type | Action Args: [8, 'Delhi']]



0: 736x1280 14 icons, 7.3ms
Speed: 3.7ms preprocess, 7.3ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:50:30,094: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-19 23:50:42,539: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 10.000000 seconds]
[2024-12-19 23:50:53,303: INFO: logger: Step 4 | Action: Click | Action Args: [0, 'Search flights']]



0: 736x1280 15 icons, 6.4ms
Speed: 3.2ms preprocess, 6.4ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:50:54,789: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-19 23:51:05,894: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:51:14,925: INFO: logger: Step 5 | Action: None | Action Args: None]


{
    "thoughts": "Based on the screenshot, I can see that there are several text boxes and buttons on the page. The objective is to find the best flight tickets from Delhi to San Francisco on 21st December. * I need to enter the departure and arrival airports in the respective text boxes. * I need to select the date of travel, which is 21st December. * I need to click the 'Search flights' button to initiate the search.",
    "action": "Type",
    "args": [9, "DEL"]
}
{
    "thoughts": "Based on the screenshot, I can see that there are several text boxes and buttons on the page. The objective is to find the best flight tickets from Delhi to San Francisco on 21st December. * I need to enter the departure and arrival airports in the respective text boxes. * I need to select the date of travel, which is 21st December. * I need to click the 'Search flights' button to initiate the search.",
    "action": "Type",
    "args": [10, "San Francisco"]
}
{
    "thoughts": "Based on the screenshot,

[2024-12-19 23:51:16,303: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 10.000000 seconds]
[2024-12-19 23:51:27,891: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:51:34,720: INFO: logger: Step 6 | Action: Click | Action Args: [13]]



0: 736x1280 56 icons, 6.9ms
Speed: 3.0ms preprocess, 6.9ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:51:40,944: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:51:47,967: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:51:55,898: INFO: logger: Step 7 | Action: None | Action Args: None]



0: 736x1280 53 icons, 6.9ms
Speed: 3.3ms preprocess, 6.9ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:52:01,866: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-19 23:52:08,628: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 10.000000 seconds]
[2024-12-19 23:52:19,417: INFO: logger: Step 8 | Action: Click | Action Args: [0]]



0: 736x1280 18 icons, 6.6ms
Speed: 3.1ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:52:21,250: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:52:30,804: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:52:38,910: INFO: logger: Step 9 | Action: Google | Action Args: None]



0: 736x1280 5 icons, 7.1ms
Speed: 3.2ms preprocess, 7.1ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:52:40,877: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-19 23:52:51,441: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:52:59,989: INFO: logger: Step 10 | Action: GoBack | Action Args: None]



0: 736x1280 5 icons, 10.1ms
Speed: 4.0ms preprocess, 10.1ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:53:01,619: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 10.000000 seconds]
[2024-12-19 23:53:13,344: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:53:20,424: INFO: logger: Step 11 | Action: Click | Action Args: [5]]



0: 736x1280 17 icons, 6.9ms
Speed: 3.1ms preprocess, 6.9ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:53:23,498: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:53:32,030: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:53:39,796: INFO: logger: Step 1 | Action: Type | Action Args: [2, 'non-alcoholic eggnog recipe']]



0: 736x1280 8 icons, 6.7ms
Speed: 3.3ms preprocess, 6.7ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:53:41,619: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:53:50,077: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-19 23:53:56,025: INFO: logger: Step 3 | Action: Click | Action Args: [21]]



0: 736x1280 3 icons, 7.1ms
Speed: 3.4ms preprocess, 7.1ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:53:57,626: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:54:04,803: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 4.000000 seconds]
[2024-12-19 23:54:09,326: INFO: logger: Step 4 | Action: Google | Action Args: None]



0: 736x1280 17 icons, 6.6ms
Speed: 3.0ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:54:13,048: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:54:21,247: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:54:28,138: INFO: logger: Step 5 | Action: Click | Action Args: [18]]



0: 736x1280 3 icons, 7.0ms
Speed: 3.3ms preprocess, 7.0ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:54:30,254: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:54:38,738: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:54:46,935: INFO: logger: Step 6 | Action: Click | Action Args: [27]]



0: 736x1280 7 icons, 6.6ms
Speed: 3.1ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:54:48,514: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-19 23:54:58,849: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:55:05,818: INFO: logger: Step 1 | Action: Click | Action Args: [20]]



0: 736x1280 8 icons, 6.8ms
Speed: 3.3ms preprocess, 6.8ms inference, 1.4ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:55:07,739: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:55:16,731: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:55:27,020: INFO: logger: Step 2 | Action: None | Action Args: None]



0: 736x1280 59 icons, 6.8ms
Speed: 3.4ms preprocess, 6.8ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:55:32,123: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:55:39,604: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:55:47,422: INFO: logger: Step 3 | Action: Click | Action Args: [7]]



0: 736x1280 55 icons, 6.9ms
Speed: 3.4ms preprocess, 6.9ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:55:52,197: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:56:01,006: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:56:09,823: INFO: logger: Step 4 | Action: Type | Action Args: [8, 'DEL', 9, 'DXB', 10, 20, 11, 25]]



0: 736x1280 21 icons, 6.7ms
Speed: 3.2ms preprocess, 6.7ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:56:11,976: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-19 23:56:23,597: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:56:31,314: INFO: logger: Step 5 | Action: Click | Action Args: [0]]



0: 736x1280 21 icons, 6.9ms
Speed: 3.3ms preprocess, 6.9ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:56:32,849: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:56:42,111: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:56:48,780: INFO: logger: Step 6 | Action: Click | Action Args: [0]]



0: 736x1280 47 icons, 6.7ms
Speed: 3.2ms preprocess, 6.7ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:56:53,413: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:57:03,293: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:57:12,546: INFO: logger: Step 7 | Action: Type | Action Args: [14, '20 Dec 24']]



0: 736x1280 16 icons, 7.1ms
Speed: 3.1ms preprocess, 7.1ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:57:14,589: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:57:24,996: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:57:37,545: INFO: logger: Step 8 | Action: None | Action Args: None]


{
    "thoughts": "* The objective is to find the best flight from Delhi to Dubai on 20th December and returning on 25th December.\n* The bounding boxes provided are mostly text-based, with some icons.\n* The search flights button is present in the bounding boxes, but it's not clear if it's the correct one to use.\n* There are multiple departure and arrival airport options, but no clear indication of which one to choose.\n* The dates are also present, but it's not clear how to select the correct dates.\n* There are no clear options for selecting the number of passengers or class of travel.",
    "action": "Type",
    "args": [9, "Delhi"]
}
{
    "thoughts": "* The objective is to find the best flight from Delhi to Dubai on 20th December and returning on 25th December.\n* The bounding boxes provided are mostly text-based, with some icons.\n* The search flights button is present in the bounding boxes, but it's not clear if it's the correct one to use.\n* There are multiple departure and 

[2024-12-19 23:57:39,098: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 12.000000 seconds]
[2024-12-19 23:57:52,784: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:57:59,698: INFO: logger: Step 9 | Action: Click | Action Args: [2]]



0: 736x1280 6 icons, 7.1ms
Speed: 3.0ms preprocess, 7.1ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:58:01,491: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:58:09,848: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-19 23:58:16,087: INFO: logger: Step 1 | Action: Type | Action Args: [22, 'Launchpad.ai blog']]



0: 736x1280 16 icons, 6.6ms
Speed: 3.1ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:58:17,928: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-19 23:58:27,312: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:58:36,007: INFO: logger: Step 2 | Action: Click | Action Args: [0]]



0: 736x1280 1 icon, 8.9ms
Speed: 4.2ms preprocess, 8.9ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:58:37,767: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:58:46,301: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:58:53,286: INFO: logger: Step 3 | Action: Click | Action Args: [3]]



0: 736x1280 4 icons, 6.7ms
Speed: 3.1ms preprocess, 6.7ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:58:55,044: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 10.000000 seconds]
[2024-12-19 23:59:07,294: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-19 23:59:13,215: INFO: logger: Step 4 | Action: Click | Action Args: [7]]



0: 736x1280 18 icons, 6.9ms
Speed: 3.1ms preprocess, 6.9ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:59:15,016: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-19 23:59:25,116: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-19 23:59:31,734: INFO: logger: Step 1 | Action: Click | Action Args: [8]]



0: 736x1280 6 icons, 6.6ms
Speed: 3.0ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:59:33,269: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-19 23:59:41,202: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-19 23:59:47,395: INFO: logger: Step 1 | Action: Type | Action Args: [12, 'Pydantic']]



0: 736x1280 14 icons, 7.1ms
Speed: 3.8ms preprocess, 7.1ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-19 23:59:49,136: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-20 00:00:00,049: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-20 00:00:06,803: INFO: logger: Step 2 | Action: Click | Action Args: [39]]



0: 736x1280 14 icons, 6.6ms
Speed: 3.3ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-20 00:00:08,833: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-20 00:00:19,123: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-20 00:00:28,105: INFO: logger: Step 3 | Action: Click | Action Args: [13]]



0: 736x1280 21 icons, 7.8ms
Speed: 3.3ms preprocess, 7.8ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-20 00:00:30,793: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-20 00:00:39,156: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-20 00:00:46,993: INFO: logger: Step 4 | Action: Type | Action Args: [1, 'Pydantic']]



0: 736x1280 6 icons, 6.6ms
Speed: 3.3ms preprocess, 6.6ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-20 00:00:48,521: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-20 00:00:56,759: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 5.000000 seconds]
[2024-12-20 00:01:02,480: INFO: logger: Step 1 | Action: Type | Action Args: [22, 'Tesla stock price Apple stock price']]



0: 736x1280 12 icons, 7.0ms
Speed: 3.3ms preprocess, 7.0ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-20 00:01:04,252: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 9.000000 seconds]
[2024-12-20 00:01:14,817: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 6.000000 seconds]
[2024-12-20 00:01:21,637: INFO: logger: Step 2 | Action: Click | Action Args: [2]]



0: 736x1280 14 icons, 6.8ms
Speed: 3.3ms preprocess, 6.8ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


[2024-12-20 00:01:23,869: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 8.000000 seconds]
[2024-12-20 00:01:34,866: INFO: _base_client: Retrying request to /openai/v1/chat/completions in 7.000000 seconds]
[2024-12-20 00:01:43,070: INFO: logger: Step 3 | Action: Answer | Action Args: ['$249.84', '$436.31']]


In [5]:
# Rebuild the example list without re-running the benchmark: take every entry
# of the Examples folder whose name starts with "story_", in sorted order, and
# concatenate the examples that select_examples returns for each of them.
usr_stories = sorted(
    entry
    for entry in os.listdir(Path(project_root) / "Examples")
    if entry.startswith("story_")
)
examples = [
    example
    for story in usr_stories
    for example in select_examples(file_path=Path(project_root) / "Examples" / story)
]

INFO: Found 7 examples
INFO: Found 2 examples
INFO: Found 11 examples
INFO: Found 5 examples
INFO: Found 9 examples
INFO: Found 4 examples
INFO: Found 1 examples
INFO: Found 4 examples
INFO: Found 3 examples


In [8]:
import pandas as pd
import numpy as np

# Load the benchmark results file from the project root into a DataFrame.
results_json_path = Path(project_root) / "benchmark_results.json"
df = pd.read_json(results_json_path)

In [13]:
# Preview the first rows to check the columns that were loaded
df.head()

Unnamed: 0,action_matched,correct_action,predicted_action,normalized_error,time_taken,target_window_matched,direction_matched
0,True,Click,Click,175.867688,4.233876,,
1,False,Click,Google,0.0,3.996728,,
2,False,Type,Click,0.0,9.171736,,
3,False,Scroll,Type,0.0,20.566147,,
4,False,Answer,Click,0.0,21.734212,,


In [10]:
# Overall accuracy: share of rows whose action_matched flag is set
entries = df.shape[0]
df["action_matched"].sum() / entries

0.5

In [20]:
# normalized_error of the rows where the correct action was "Click" and the
# action was matched, as a plain numpy array
distance_from_click = df.loc[
    (df["correct_action"] == "Click") & df["action_matched"],
    "normalized_error",
].to_numpy()

In [23]:
# Share of matched clicks whose normalized error is below 1
(distance_from_click < 1).sum() / distance_from_click.shape[0]

0.14285714285714285

In [30]:
# Split the results into one sub-frame per expected action, keyed by the
# action name. "Google" is left out of the list, as in the original cell.
actions = ["Click", "Type", "Scroll", "Answer"]
dfs = {}
for action in actions:
    dfs[action] = df[df["correct_action"] == action]

In [31]:
# Report, per expected action, the share of matched rows and the sample size
for action, spec_df in dfs.items():
    n_entries = len(spec_df)
    accuracy = spec_df["action_matched"].sum() / n_entries
    print(f"{action} had accuracy {accuracy} with entries {n_entries}")

Click had accuracy 0.7 with entries 20
Type had accuracy 0.45454545454545453 with entries 11
Scroll had accuracy 0.0 with entries 3
Answer had accuracy 0.16666666666666666 with entries 6
