Source: https://langchain-ai.github.io/langgraph/tutorials/web-navigation/web_voyager/

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

USE_GROQ = True  # False uses OpenAI

# Quick check that a Groq key is configured. As the last expression of a
# notebook cell its value is echoed, so the key itself lands in the cell output.
# NOTE(review): the last cell of this notebook reads GROQ_API_KEY instead of
# GROQ_CLOUD_API_KEY — confirm which variable name is the intended one.
os.getenv("GROQ_CLOUD_API_KEY")

In [None]:
# The notebook is meant to run from the main web-agent folder (the one that
# contains `src/`). With the default repository layout the notebook lives one
# level below it, so step up one directory when `src` is not visible here.
if "src" not in os.listdir():
    os.chdir("..")

In [None]:
import nest_asyncio
nest_asyncio.apply() # to allow asynchronous calls in a jupyter notebook

In [None]:
from typing import List, Optional
from typing_extensions import TypedDict

from langchain_core.messages import BaseMessage, SystemMessage
from playwright.async_api import Page

class BBox(TypedDict):
    """One annotated page element, as returned by the `markPage()` script."""
    # Coordinates the tools pass to `page.mouse.click` / `page.mouse.move`.
    x: float
    y: float
    # Element text; used as the description when `ariaLabel` is blank.
    text: str
    # Element type, rendered as `<type/>` in the bounding-box descriptions.
    type: str
    # Preferred human-readable description of the element.
    ariaLabel: str

class Prediction(TypedDict):
    """The action chosen by the agent for the current step."""
    # Name of the action; also used to route to the graph node of that name.
    action: str
    # Arguments for the action, or None when the action takes none.
    # NOTE(review): `parse` also returns a "thought" key and, for the "retry"
    # action, a plain string in `args` — neither is declared here.
    args: Optional[List[str]]

class AgentState(TypedDict):
    """State passed between the nodes of the agent graph."""
    page: Page  # The Playwright page the agent is driving
    input: str  # The user's request
    img: str #b64 encoded screenshot
    bboxes: List[BBox]  # The bounding boxes from the most recent page annotation
    prediction: Prediction # agent's output
    scratchpad: List[BaseMessage] # A system message or messages containing the intermediate steps
    observation: str # the most recent response from a tool

There will be 6 tools:

1. Click (at labeled box)
2. Type
3. Scroll
4. Wait
5. Go Back
6. Go to search engine (Google)

We define those now (a seventh tool, `Scrape`, is added further below).

In [None]:
import asyncio
import platform

In [None]:
async def click(state: AgentState):
    """Click the page element identified by a bounding-box label.

    Expected action format: ``Click [Numerical_label]``.

    Args:
        state: Agent state. Reads ``state['page']``, ``state['bboxes']`` and
            ``state['prediction']['args']`` (exactly one argument: the label).

    Returns:
        An observation string describing what happened. Malformed arguments
        and unknown labels are reported through this string instead of being
        raised, so a bad LLM reply does not abort the run.
    """
    page = state['page']
    click_args = state['prediction']['args']
    if click_args is None or len(click_args) != 1:
        return f"Failed to click bounding box labeled as number {click_args}"
    bbox_id = click_args[0]
    try:
        # The label comes from free-form LLM text, so it may not be a number.
        bbox_id = int(bbox_id)
        if bbox_id < 0:
            # A negative label would silently index from the end of the list.
            raise IndexError(bbox_id)
        bbox = state['bboxes'][bbox_id]
    except (ValueError, TypeError, IndexError, KeyError):
        return f"Error: no bbox for : {bbox_id}"

    x, y = bbox['x'], bbox['y']
    await page.mouse.click(x, y)
    # TODO: In the paper, they automatically parse any downloaded PDFs
    # We could add something similar here as well and generally
    # improve response format.
    return f"Clicked {bbox_id}"


In [None]:
async def type_text(state: AgentState):
    """Replace the content of a labeled element with text and press Enter.

    Expected action format: ``Type [Numerical_label]; [Content]``.

    Args:
        state: Agent state. Reads ``state['page']``, ``state['bboxes']`` and
            ``state['prediction']['args']`` (label, then the text to type).

    Returns:
        An observation string describing what happened. Malformed arguments
        and unknown labels are reported through this string instead of being
        raised, matching the behavior of ``click``.
    """
    page = state['page']
    type_args = state['prediction']['args']
    if type_args is None or len(type_args) != 2:
        return f"Failed to type in element from bounding box labeled as number {type_args}"
    bbox_id, text_content = type_args
    try:
        # The label comes from free-form LLM text, so it may not be a number.
        bbox_id = int(bbox_id)
        if bbox_id < 0:
            # A negative label would silently index from the end of the list.
            raise IndexError(bbox_id)
        bbox = state['bboxes'][bbox_id]
    except (ValueError, TypeError, IndexError, KeyError):
        return f"Error: no bbox for : {bbox_id}"

    x, y = bbox['x'], bbox['y']
    # Focus the element before sending any keystrokes.
    await page.mouse.click(x, y)

    # "Select all" is Cmd+A on macOS and Ctrl+A elsewhere.
    select_all = "Meta+A" if platform.system() == "Darwin" else "Control+A"
    # Clear whatever is already in the field, then type and submit.
    await page.keyboard.press(select_all)
    await page.keyboard.press("Backspace")
    await page.keyboard.type(text_content)
    await page.keyboard.press("Enter")
    return f"Typed {text_content} and submitted"


In [None]:
async def scroll(state: AgentState):
    """Scroll the whole window or a single labeled element up or down.

    Expected action format: ``Scroll [Numerical_label or WINDOW]; [up or down]``.

    Args:
        state: Agent state. Reads ``state['page']``, ``state['bboxes']`` and
            ``state['prediction']['args']`` (target, then direction).

    Returns:
        An observation string describing what happened. Malformed arguments
        and unknown labels are reported through this string instead of being
        raised, matching the behavior of ``click``.
    """
    page = state['page']
    scroll_args = state['prediction']['args']
    if scroll_args is None or len(scroll_args) != 2:
        return "Failed to scroll due to incorrect arguments"

    target, direction = scroll_args
    scroll_window = target.upper() == 'WINDOW'

    # Not sure of the best values for these...
    scroll_amount = 500 if scroll_window else 200
    # Any direction other than "up" is treated as "down".
    scroll_direction = -scroll_amount if direction.lower() == 'up' else scroll_amount

    if scroll_window:
        await page.evaluate(f"window.scrollBy(0, {scroll_direction})")
    else:
        # scrolling within a specific element
        try:
            # The label comes from free-form LLM text, so it may not be a number.
            target_id = int(target)
            if target_id < 0:
                # A negative label would silently index from the end of the list.
                raise IndexError(target_id)
            bbox = state['bboxes'][target_id]
        except (ValueError, TypeError, IndexError, KeyError):
            return f"Error: no bbox for : {target}"
        x, y = bbox['x'], bbox['y']
        # Hover over the element so the wheel event is delivered to it.
        await page.mouse.move(x, y)
        await page.mouse.wheel(0, scroll_direction)
    return f"Scrolled {direction} in {'window' if scroll_window else 'element'}"

In [None]:
async def wait(state: AgentState):
    """Sleep for a fixed interval and report how long the pause was.

    The ``state`` argument is accepted for a uniform tool signature but is
    not read.
    """
    pause_seconds = 5
    await asyncio.sleep(pause_seconds)
    return f"Waited for {pause_seconds}s"

async def go_back(state: AgentState):
    """Navigate the page one step back in its history.

    Returns:
        An observation string that includes the page URL after navigating.
    """
    current_page = state['page']
    await current_page.go_back()
    landed_on = current_page.url
    return f"Navigated back to page to {landed_on}"

async def to_google(state: AgentState):
    """Send the page to the Google home page so the agent can start over.

    Returns:
        A fixed observation string confirming the navigation.
    """
    google_url = "https://www.google.com/"
    await state['page'].goto(google_url)
    return "Navigated to google.com"

## Define the agent

The agent is driven by a multimodal model that decides which action to take at each step. It is composed of a few runnable objects:
1. A `mark_page` function to annotate the current page with bounding boxes.
2. A prompt to hold the user question, annotated image, and agent scratchpad.
3. A multimodal LLM (a Groq-hosted Llama 3.2 vision model or OpenAI's GPT-4o, depending on `USE_GROQ`) to decide the next step.
4. Parsing logic to extract the action.

We implement this below, starting with the annotation step.

### Browser annotation

In [None]:
import base64
from langchain_core.runnables import chain as chain_decorator

# JavaScript that is run on each step to select the elements to annotate and
# draw bounding boxes on them before the screenshot is taken.
# The encoding is given explicitly so the script is read the same way on every
# platform instead of depending on the locale's default encoding.
with open("src/mark_page.js", encoding="utf-8") as f:
    mark_page_script = f.read()

@chain_decorator
async def mark_page(page):
    """Annotate the page with bounding boxes and capture a screenshot of it.

    Injects the page-marking script, calls ``markPage()`` (retrying while the
    page may still be loading), takes a screenshot with the boxes visible and
    then removes the boxes again.

    Args:
        page: The Playwright page to annotate.

    Returns:
        A dict with ``"img"`` (base64-encoded screenshot) and ``"bboxes"``
        (the value returned by ``markPage()``).

    Raises:
        RuntimeError: If ``markPage()`` still fails after every attempt. The
            last underlying error is attached as the cause.
    """
    await page.evaluate(mark_page_script)
    max_attempts = 10
    last_error = None
    for _ in range(max_attempts):
        try:
            bboxes = await page.evaluate("markPage()")
            break
        except Exception as err:
            # may be loading
            last_error = err
            # The sleep must be awaited; a bare call only creates a coroutine
            # object and the loop would retry immediately.
            await asyncio.sleep(3)
    else:
        # Without this, `bboxes` would be unbound below.
        raise RuntimeError(
            f"markPage() did not succeed after {max_attempts} attempts"
        ) from last_error
    screenshot = await page.screenshot()
    # Ensure the bboxes don't follow us around
    await page.evaluate("unmarkPage()")
    return {
        "img": base64.b64encode(screenshot).decode('utf-8'),
        "bboxes": bboxes
    }

### Agent Definition

In [None]:
from langsmith import Client
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
import re

async def annotate(state):
    """Return a copy of the state extended with a fresh page annotation.

    Runs ``mark_page`` (wrapped with its retry policy) on ``state["page"]``
    and merges the resulting keys over the existing state.
    """
    retrying_marker = mark_page.with_retry()
    page_annotation = await retrying_marker.ainvoke(state["page"])
    merged_state = dict(state)
    merged_state.update(page_annotation)
    return merged_state


def format_descriptions(state):
    """Return a copy of the state with a text listing of its bounding boxes.

    Each box becomes one line of the form ``<index> (<type/>): "<label>"``,
    where the label is the box's ``ariaLabel`` or, when that is blank, its
    ``text``. The listing is stored under ``"bbox_descriptions"``.
    """
    def describe(index, box):
        # Prefer the aria label; fall back to the visible text when it is blank.
        label = box.get("ariaLabel") or ""
        if not label.strip():
            label = box["text"]
        element_type = box.get("type")
        return f'{index} (<{element_type}/>): "{label}"'

    lines = [describe(index, box) for index, box in enumerate(state["bboxes"])]
    listing = "\nValid Bounding Boxes:\n" + "\n".join(lines)
    described_state = dict(state)
    described_state["bbox_descriptions"] = listing
    return described_state


def parse(text: str) -> dict:
    """Parse a raw LLM reply into an action, its arguments and the thought.

    The reply is expected to look like ``Thought: ...`` followed by
    ``Action: <name> <args>`` on a new line; several formatting variants of
    both prefixes are tolerated.

    Args:
        text: The raw text returned by the LLM.

    Returns:
        A dict with keys ``"action"``, ``"args"`` and ``"thought"``.
        ``"args"`` is a list of strings, or ``None`` when the action has no
        arguments. When no action can be found (no ``Action:`` marker, or
        nothing after the last one) the action is ``"retry"``, ``"args"`` is
        an explanatory string and ``"thought"`` is the whole reply.
    """
    # set up regex for prefixes for our two points of interest
    thought_prefix = r"(?i)thoughts*:*\** *-* *\n*"
    action_prefix = r"(?i)\n *\**action:*\** *-* *\n*"  # Just combining all prefixes seen so far

    retry_prediction = {
        "action": "retry",
        "args": f"Could not parse LLM Action in: {text.strip()},\nWe will mark this as the thought",
        "thought": text,
    }

    # Extract the thought
    text_by_thought = re.split(thought_prefix, text, maxsplit=1)
    if len(text_by_thought) > 1:
        # We want everything from "Thought:" until the last "Action:", so we
        # split by "Action:" and drop the final piece. The word "action" may
        # also occur inside the thought itself, so the remaining pieces are
        # glued back together with " action " between them.
        thought_pieces = re.split(action_prefix, text_by_thought[1])[:-1]
        thought = " action ".join(thought_pieces).strip()
    else:
        thought = "cannot find the thought"

    text_by_action = re.split(action_prefix, text)
    if len(text_by_action) < 2:
        return retry_prediction
    action_block = text_by_action[-1].strip()
    if not action_block:
        # "Action:" with nothing after it would otherwise produce an empty
        # action name that no graph node matches.
        return retry_prediction
    split_output = action_block.split(" ", 1)

    # Extract the parameters for the function, and what function to use
    if len(split_output) == 1:
        action, action_input = split_output[0], None # If there's no parameters, just set the action
    else:
        action, action_input = split_output
    action = action.strip()
    if action_input is not None:
        action_input = [
            inp.strip("\"'[]") for inp in re.split(";[\t ]*|,[\t ]*|\t[\t ]*|  *\"|  *'", action_input.strip())  # Cut the parameters string into individual parameters
        ]
        # A single parameter made of exactly two words (e.g. "WINDOW down")
        # is treated as two space-separated parameters.
        if len(action_input) == 1 and len(action_input[0].split(" ")) == 2:
            action_input = [inp.strip("\"'[]") for inp in action_input[0].split(" ")]
    return {"action": action, "args": action_input, "thought": thought}



In [None]:
parsed_dict = parse("""
                    I'm not going to engage in this topic matter""")
print(repr(parsed_dict["args"]))

Now we build the scraper tool.

In [None]:
import json
import os
from typing import List
import asyncio
import aiofiles

from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain_core.runnables import RunnableLambda, Runnable

from src.custom_types import AgentState


# One extracted rate entry. All three fields are required strings.
# NOTE(review): the class docstring and the Field descriptions are presumably
# surfaced to the LLM through `parser.get_format_instructions()`, so treat
# their wording as part of the prompt — confirm before editing them.
class RateInformation(BaseModel):
    """Information about a pricing"""
    rate: str = Field(..., description="The interest rate value as a string (e.g. %2.37)")
    description: str = Field(..., description="A brief description of the financial instrument")
    organization: str = Field(..., description="A brief description of the lender or the financial provider")

# Top-level schema handed to the output parser: a list of RateInformation
# entries stored under the `prices` field.
class Rates(BaseModel):
    """Identifying all pricing information present in the screenshot or text."""
    prices: List[RateInformation] = Field(..., description="A list of pricing information found on the page")


# Parser built from the Rates schema; only its format instructions are used
# below, to tell the model which JSON structure to produce.
parser = PydanticOutputParser(pydantic_object=Rates)

# Single-message prompt for the scraper: the extraction instructions followed
# by the screenshot, passed as a base64 PNG data URL in the `img` variable.
# `format_instructions` is filled in once here via `.partial(...)`.
# NOTE(review): the text asks for "only a JSON-formatted string" but also for
# `None` when nothing is found — confirm downstream code copes with both.
SCRAPER_PROMPT = ChatPromptTemplate.from_messages(
    [
        (
            "user", 
            [
                {
                    "type": "text",
                    "text": """You are a specialized AI who is a part of a multi-agent system tasked with the scraping interest rate and other financial information from screenshots of websites.
                    Below you are given a screenshot of a website. Analyze the image and extract all interest rate or other financial details.
                    Return them as a JSON-formatted string that conforms to the following schema:
                    {format_instructions}
                    Return only a JSON-formatted string and nothing else.
                    If no pricing information is found, return `None`."""
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/png;base64,{img}"}
                }
            ]
        )
    ]
).partial(format_instructions=parser.get_format_instructions())

class Scraper(Runnable):
    """Runnable that asks a multimodal LLM to extract data from the current screenshot.

    The model's reply is stored in ``state['observation']`` and also appended
    as one JSON line to ``save_file``.
    """

    def __init__(self, mm_llm, prompt, save_file: str = "scraped_data.jsonl"):
        """Build the scraping chain and make sure the output file exists.

        Args:
            mm_llm: The multimodal chat model that receives the prompt.
            prompt: Prompt template expecting an ``img`` variable.
            save_file: Path of the JSON-lines file that collects the results.
        """
        super().__init__()
        self.chain = prompt | mm_llm
        self.save_file = save_file

        # Append mode creates the file when it is missing and never truncates
        # existing content, so no separate existence check is needed.
        with open(self.save_file, "a"):
            pass

    async def run(self, state: AgentState):
        """Scrape the screenshot held in ``state['img']``.

        Args:
            state: Agent state; ``state['img']`` must hold the base64 screenshot.

        Returns:
            The same ``state`` object, with ``state['observation']`` set to
            the model's reply.

        Raises:
            ValueError: If the state has no screenshot.
        """
        img_data = state.get('img')
        if not img_data:
            raise ValueError("Screenshot image is missing from the state.")

        print("Passing image to chain...")
        result = await self.chain.ainvoke({"img": img_data})
        state['observation'] = result.content # TODO(dominic): is this the best way to do this? Or should we modify how we record observations...

        async with aiofiles.open(self.save_file, "a") as f:
            json_line = json.dumps(result.content)
            await f.write(json_line + "\n")

        return state

    async def ainvoke(self, state: AgentState, *args, **kwargs):
        """Async entry point; extra positional and keyword arguments are ignored."""
        return await self.run(state)

    def invoke(self, state: AgentState, *args, **kwargs):
        """Sync entry point; extra positional and keyword arguments are ignored.

        Returns:
            The updated state when no event loop is running in this thread.
            When a loop is already running (e.g. inside a notebook) the call
            cannot block, so the scheduled ``asyncio.Task`` is returned and
            the caller has to await it.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop in this thread: create_task would raise, so run the
            # coroutine to completion instead.
            return asyncio.run(self.run(state))
        # TODO(dominic) probably not best practice to do this...
        return asyncio.create_task(self.run(state))


In [None]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder


# Main agent prompt: the instructions, then the scratchpad of earlier
# observations (if any), then the annotated screenshot, the bounding-box
# descriptions and the user's request.
prompt = ChatPromptTemplate.from_messages(
    [
        ("human", """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task. In each iteration, you will receive an Observation that includes a screenshot of a webpage and some texts. This screenshot will
feature Numerical Labels placed in the TOP LEFT corner of each Web Element. Carefully analyze the visual
information to identify the Numerical Label corresponding to the Web Element that requires interaction, then follow
the guidelines and choose one of the following actions:

1. Click a Web Element.
2. Delete existing content in a textbox and then type content.
3. Scroll up or down.
4. Wait 
5. Go back
6. Scrape
7. Return to google to start over.
8. Respond with the final answer

Correspondingly, Action should STRICTLY follow the format:

- Click [Numerical_Label] 
- Type [Numerical_Label]; [Content] 
- Scroll [Numerical_Label or WINDOW]; [up or down] 
- Wait 
- GoBack
- Scrape
- Google
- ANSWER; [content]

Key Guidelines You MUST follow:

* Action guidelines *
1) Execute only one action per iteration.
2) When clicking or typing, ensure to select the correct bounding box.
3) Numeric labels lie in the top-left corner of their corresponding bounding boxes and are colored the same.

* Web Browsing Guidelines *
1) Don't interact with useless web elements like Login, Sign-in, donation that appear in Webpages
2) Select strategically to minimize time wasted.

Your reply should strictly follow the format:

Thought: {{Your brief thoughts (briefly summarize the info that will help ANSWER)}}
Action: {{One Action format you choose}}
Then the User will provide:
Observation: {{A labeled screenshot Given by User}}
"""),
    MessagesPlaceholder("scratchpad", optional=True),
        (
            "user",
            [
                {
                    "type": "image_url",
                    # The screenshot comes from `page.screenshot()` with no
                    # arguments, which produces PNG data, so the data URL
                    # declares image/png (as the scraper prompt already does).
                    "image_url": {"url": "data:image/png;base64,{img}"},
                },
                {
                    'type': 'text',
                    'text': "{bbox_descriptions}"
                },
                {
                    'type': 'text',
                    'text': "{input}"
                }
                
            ],
        ),
    ]
)

In [None]:
prompt

In [None]:
from logging import Logger
from langchain_groq import ChatGroq

def logger(text):
    """Write ``text`` to ``log.txt``, replacing whatever the file held before.

    Only the most recent entry is kept because the file is opened in write
    mode on every call.

    Args:
        text: The string to store.
    """
    # The context manager closes the file even if the write fails; UTF-8 is
    # set explicitly so non-ASCII text is written the same way on every platform.
    with open("log.txt", "w", encoding="utf-8") as f:
        f.write(text)

# ChatGroq variant that writes the positional arguments of every call to the
# log file (via `logger`) before delegating to the parent implementation.
# Keyword arguments are forwarded but not logged.
class Print_interrupter(ChatGroq):
    def invoke(self, *args, **kwargs):
        logger("".join(map(str, args)))
        return super().invoke(*args, **kwargs)

    def ainvoke(self, *args, **kwargs):
        # Not declared `async`: logging happens at call time and the parent's
        # awaitable is handed straight back to the caller.
        logger("".join(map(str, args)))
        return super().ainvoke(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        logger("".join(map(str, args)))
        return super().__call__(*args, **kwargs)


In [None]:
from langchain_core.runnables import Runnable

# Define a custom logging step to print the prompt
class PrintRunnable(Runnable):
    """Pass-through pipeline step that prints its input and returns it unchanged.

    ``config`` defaults to ``None`` so the step can also be called directly
    with just an input, not only from inside a pipeline that supplies one.
    """

    def invoke(self, input, config=None, **kwargs):
        """Print ``input`` and return it; ``config`` and ``kwargs`` are ignored."""
        print("Formatted prompt before sending to LLM:")
        print(input)
        return input

    async def ainvoke(self, input, config=None, **kwargs):
        """Async variant of ``invoke`` with the same behavior."""
        print("Formatted prompt before sending to LLM:")
        print(input)  # Print the formatted input to check
        return input  # Pass the input to the next step in the pipeline
    
def print_llm_output(input):
    """Print ``input`` and return it unchanged, so it can sit inside a pipeline."""
    print(input)
    return input



In [None]:
from langchain_groq import ChatGroq

# Choose the chat model that backs both the agent and the scraper.
if USE_GROQ:
    llm = ChatGroq(model="llama-3.2-90b-vision-preview", max_tokens=4096)
else:
    llm = ChatOpenAI(model="gpt-4o", max_tokens=4096)

# Agent pipeline: annotate the page, then add a "prediction" key computed by
# describing the bounding boxes, filling the prompt, calling the model and
# parsing its text reply.
agent = annotate | RunnablePassthrough.assign(
    prediction=format_descriptions | prompt | llm | StrOutputParser() | parse
)
# The scraper uses the same model with its own extraction prompt.
scraper_tool = Scraper(mm_llm=llm, prompt=SCRAPER_PROMPT)


### Compile the graph

In [None]:
import re
from langchain_core.messages import HumanMessage


def update_scratchpad(state: AgentState):
    """After a tool is invoked, we want to update
    the scratchpad so the agent is aware of its previous steps.

    The scratchpad is a single message whose text is a numbered list of
    observations; the latest ``state['observation']`` is appended with the
    next number.

    Args:
        state: Agent state. Reads ``state['scratchpad']`` (may be missing or
            empty) and ``state['observation']``.

    Returns:
        A copy of the state whose ``"scratchpad"`` is a one-element list
        holding the updated message.
    """
    old = state.get("scratchpad")
    if old:
        txt = old[0].content
        # An observation can span several lines, so the last line of the text
        # is not guaranteed to start with a step number. Collect every line
        # that opens with "<number>. " and continue from the highest one.
        numbered = re.findall(r"^(\d+)\. ", txt, flags=re.MULTILINE)
        step = max(map(int, numbered)) + 1 if numbered else 1
    else:
        txt = "Previous action observations:\n"
        step = 1
    txt += f"\n{step}. {state['observation']}"

    return {**state, "scratchpad": [HumanMessage(content=txt)]}

Now we can compose the graph.

In [None]:
from langchain_core.runnables import RunnableLambda

from langgraph.graph import END, START, StateGraph

graph_builder = StateGraph(AgentState)


graph_builder.add_node("agent", agent)
graph_builder.add_edge(START, "agent") # START -> agent

graph_builder.add_node("update_scratchpad", update_scratchpad)
graph_builder.add_edge("update_scratchpad", "agent")

tools = {
    "Click": click,
    "Type": type_text,
    "Scroll": scroll,
    "Wait": wait,
    "GoBack": go_back,
    "Google": to_google,
}


for node_name, tool in tools.items():
    graph_builder.add_node(
        node_name,
        # The lambda ensures the function's string output is mapped to the "observation"
        # key in the AgentState
        RunnableLambda(tool) | (lambda observation: {"observation": observation}),
    )
    # Always return to the agent (by means of the update-scratchpad node)
    graph_builder.add_edge(node_name, "update_scratchpad")

# Add in the scraper
graph_builder.add_node("Scrape", scraper_tool)
graph_builder.add_edge("Scrape", "update_scratchpad")


def select_tool(state: AgentState):
    """Route the agent's latest prediction to the next graph node.

    Called every time the agent node completes.

    Returns:
        ``END`` when the action starts with "ANSWER", ``"agent"`` when the
        action is "retry", and otherwise the action name itself, which is
        used as the name of the node to run.
    """
    chosen_action = state["prediction"]["action"]
    if chosen_action.startswith("ANSWER"):
        return END
    return "agent" if chosen_action == "retry" else chosen_action


graph_builder.add_conditional_edges("agent", select_tool)

graph = graph_builder.compile()

In [None]:
from IPython.display import Image, display

try:
    display(Image(graph.get_graph().draw_mermaid_png()))
except Exception:
    # This requires some extra dependencies and is optional
    pass

In [None]:
from IPython import display
from playwright.async_api import async_playwright
import langchain

langchain.debug = False

browser = await async_playwright().start()
# We will set headless=False so we can watch the agent navigate the web.
browser = await browser.chromium.launch(headless=False, args=None)
page = await browser.new_page()
_ = await page.goto("https://www.google.com/")

# initial_state = {
#     "page": page,
#     "input": "Please find the best mortgage rate available to me using bankrate.com. My zip code is 90210 and the purchase price is $400,000 with a down payment of $85,000. My credit score is 800.",
#     "scratchpad": [],
# }

# result = await agent.ainvoke(initial_state)


async def call_agent(question: str, page, max_steps: int = 150):
    """Run the agent graph on a question and show its progress step by step.

    After every agent step the output area is cleared and redrawn with the
    list of actions taken so far and the latest screenshot.

    Args:
        question: The user's request, passed to the graph as ``input``.
        page: The Playwright page the agent should drive.
        max_steps: Passed to the graph as its ``recursion_limit``.

    Returns:
        The final answer text, or ``None`` if the stream ended without an
        ANSWER action or the ANSWER action carried no content.
    """
    event_stream = graph.astream(
        {
            "page": page,
            "input": question,
            "scratchpad": [],
        },
        {
            "recursion_limit": max_steps,
        },
    )
    final_answer = None
    steps = []
    async for event in event_stream:
        # We'll display an event stream here
        if "agent" not in event:
            continue
        agent_output = event["agent"]
        pred = agent_output.get("prediction") or {}
        print(pred)
        action = pred.get("action")
        action_input = pred.get("args")
        display.clear_output(wait=False)
        steps.append(f"{len(steps) + 1}. {action}: {action_input}")
        print("\n".join(steps))
        img = agent_output.get("img")
        if img:
            display.display(display.Image(base64.b64decode(img)))
        # `action` is None when the prediction is missing, so check it before
        # the substring test.
        if action and "ANSWER" in action:
            if isinstance(action_input, str):
                final_answer = action_input
            elif action_input:
                # The parser splits arguments on commas and semicolons, so an
                # answer containing them arrives in several pieces; re-join
                # them instead of returning only the first fragment.
                final_answer = ", ".join(action_input)
            break
    return final_answer

In [None]:
res = await call_agent("What is the weather in London?", page)
print(f"Final response: {res}")

In [None]:
import os

In [None]:
os.getenv("GROQ_API_KEY")