# Web browsing agents with LangChain
### Setup agent functions

In [1]:
# Install the notebook's dependencies with the %pip magic, one package per line.
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# newer than the ones this notebook was last run with.
%pip install python-dotenv
%pip install langchain
%pip install langchain-community
%pip install playwright
%pip install tarsier
%pip install langchain-anthropic
# Shell out to the Playwright CLI's install command (chromium is launched in a later cell).
!playwright install

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# import os

# # The location of my CA File
# cert_file = "/Users/itaimaoz/Downloads/mitmproxy-ca-cert.pem"
# os.environ["REQUESTS_CA_BUNDLE"] = cert_file
# os.environ["SSL_CERT_FILE"] = cert_file
# os.environ["HTTPS_PROXY"] = "http://127.0.0.1:8080"

In [3]:
from playwright.async_api import async_playwright

# Setup Playwright
p = await async_playwright().__aenter__()
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()


In [4]:

from langchain.agents import tool

from tarsier import Tarsier, GoogleVisionOCRService
# import os
import json
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables from the project-local dotenv file.
# NOTE(review): presumably this is where the Anthropic API key used by
# ChatAnthropic comes from — confirm.
load_dotenv("../.env.local")


# Setup Creds
# os.environ["OPENAI_API_KEY"] = "sk-..."
# The Google Cloud credentials are read from a JSON file next to the project
# rather than being embedded in the notebook.
with open("../.tarsier.json", "r") as f:   
    google_cloud_credentials = json.load(f)

# Setup Tarsier
# Tarsier is constructed with a Google Vision OCR service built from the
# credentials loaded above.
ocr_service = GoogleVisionOCRService(google_cloud_credentials)
tarsier = Tarsier(ocr_service)
# Shared lookup from element id to tag metadata (each entry has an 'xpath' key).
# read_page_impl() clears and refills it in place; click() and type_text()
# resolve element ids through it.
tag_to_xpath = {}


# Define tools/actions
@tool(response_format="content_and_artifact")
async def read_page() -> tuple[str, bytes]:
    """
    Use to read the current state of the page
    """
    # The docstring above is the tool description, so it is kept verbatim and
    # developer notes live in comments instead.
    #
    # Returns the (page_text, screenshot_bytes) pair produced by
    # read_page_impl(): with response_format="content_and_artifact" the tool
    # must return a two-element tuple, which is why the annotation is a tuple
    # rather than the previous `str`.
    return await read_page_impl()


async def read_page_impl() -> tuple[str, bytes]:
    """
    Capture the current page as text plus a screenshot.

    Runs Tarsier on the shared `page`, saves a timestamped PNG under
    ../screenshots, and replaces the contents of the shared `tag_to_xpath`
    map with the mapping Tarsier just produced.

    Returns:
        (page_text, image_data): the text returned by Tarsier and the raw
        bytes of the saved screenshot.
    """
    page_text, inner_tag_to_xpath = await tarsier.page_to_text(page)

    # Timestamp has one-second resolution, so two reads within the same second
    # write to the same file.
    now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # The previous f-string used `${now}`, which in Python leaves a literal '$'
    # at the start of every filename; files are now named `<timestamp>.png`.
    # page.screenshot() returns the image bytes as well as writing `path`, so
    # the file no longer has to be reopened and read back.
    image_data = await page.screenshot(path=f"../screenshots/{now}.png")

    # Mutate the shared dict in place (instead of rebinding it) so the other
    # tool functions, which look ids up in the same object, see the new ids.
    tag_to_xpath.clear()
    tag_to_xpath.update(inner_tag_to_xpath)

    return page_text, image_data


@tool(response_format="content_and_artifact")
async def click(element_id: int) -> tuple[str, bytes]:
    """
    Click on an element based on element_id and return the new page state
    """
    # The docstring above is the tool description, so it is kept verbatim and
    # developer notes live in comments instead.
    tag = tag_to_xpath.get(element_id)
    if tag is None:
        # Ids are only valid for the most recent page read (the map is cleared
        # and refilled on every read). Instead of raising KeyError and aborting
        # the whole agent run, report the problem together with a fresh page
        # state so the caller can pick a valid id.
        page_text, image_data = await read_page_impl()
        return (
            f"Error: element_id {element_id} was not found in the last page read; "
            f"nothing was clicked. Current page state:\n{page_text}",
            image_data,
        )

    x_path = tag['xpath']
    print(x_path)
    element = page.locator(x_path)
    await element.scroll_into_view_if_needed()
    # Fixed pauses: one after scrolling, one after the click, before the page
    # is read again.
    await page.wait_for_timeout(1000)
    await element.click()
    await page.wait_for_timeout(2000)
    return await read_page_impl()


@tool(response_format="content_and_artifact")
async def type_text(element_id: int, text: str) -> tuple[str, bytes]:
    """
    Input text into a textbox based on element_id and return the new page state
    """
    # The docstring above is the tool description, so it is kept verbatim and
    # developer notes live in comments instead.
    tag = tag_to_xpath.get(element_id)
    if tag is None:
        # Ids are only valid for the most recent page read. Report the problem
        # with a fresh page state instead of raising KeyError and aborting the
        # whole agent run.
        page_text, image_data = await read_page_impl()
        return (
            f"Error: element_id {element_id} was not found in the last page read; "
            f"nothing was typed. Current page state:\n{page_text}",
            image_data,
        )

    x_path = tag['xpath']
    print(x_path)
    textbox = page.locator(x_path)
    try:
        await textbox.clear()
    except Exception as e:
        # Deliberately best-effort: if clearing fails, the error is printed and
        # the text is still typed (it may then be appended to existing content).
        print(e)
    # Typed key by key rather than set in one go.
    await textbox.press_sequentially(text)
    # Same settle delay that click() and press_key() use before re-reading.
    # Without it the returned page state can be captured before the page has
    # reacted to the typed text (e.g. results that appear while typing).
    await page.wait_for_timeout(2000)
    return await read_page_impl()


@tool(response_format="content_and_artifact")
async def press_key(key: str) -> tuple[str, bytes]:
    """
    Press a key on the keyboard and return the new page state
    """
    # The docstring above is the tool description, so it is kept verbatim and
    # developer notes live in comments instead.
    #
    # The key goes to the page's keyboard (whatever currently has focus), not
    # to a specific element id. The return annotation is a tuple because the
    # result is the (page_text, screenshot_bytes) pair from read_page_impl(),
    # not a plain string.
    await page.keyboard.press(key)
    # Fixed pause before the page is read again.
    await page.wait_for_timeout(2000)
    return await read_page_impl()

### Claude 3.5 Sonnet + Tarsier

In [5]:
from langchain.prompts import ChatPromptTemplate
from langchain.agents import initialize_agent, AgentType
from langchain.chains import LLMChain
from langchain_anthropic import ChatAnthropic

# ReAct-style prompt text with {tools}, {tool_names}, {input} and
# {agent_scratchpad} placeholders.
# NOTE(review): the "These were previous tasks you completed:" heading is
# followed by no content, and nothing fills it in.
template = """
You are a web interaction agent. Use the read page tool to understand where you currently are. 
You will be passed in OCR text of a web page and its screenshot where element ids are to the left of elements. 

You have access to the following tools:
{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

These were previous tasks you completed:

Begin!

Question: {input}
{agent_scratchpad}"""
# Wrap the template text as a chat prompt.
prompt = ChatPromptTemplate.from_template(template)

llm = ChatAnthropic(model_name="claude-3-5-sonnet-20241022", temperature=0)
# NOTE(review): `llm_chain` (and therefore `prompt` / `template`) is not passed
# to initialize_agent() below, so the agent does not receive this custom prompt
# through that call.
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Setup chain
# NOTE(review): press_key is defined as a tool earlier in the notebook but is
# not in this list, so the agent cannot call it — confirm that is intended.
# No memory argument is passed either.
tarsier_agent_chain = initialize_agent(
    [read_page, click, type_text],
    llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

# Navigation is done in the following cells with page.goto(...) before each agent run.

  llm_chain = LLMChain(llm=llm, prompt=prompt)
  tarsier_agent_chain = initialize_agent(


In [6]:
await page.goto("https://nextjs-dashboard-nine-phi-61.vercel.app/login")
await tarsier_agent_chain.arun(
    """
    Read the page, log in with username: itaimaoz@gmail.com and password: rze_VKG1ycp0xnv5zry
    """
)

  await tarsier_agent_chain.arun(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mLet me help you log in. First, I'll read the current page state.

Action:
```{
  "action": "read_page",
  "action_input": {}
}
```

[0m
Observation: [36;1m[1;3m----------------------------------------------------
**Acme**


                               Please log in to continue.
                                [ $ 0 ] Email
                                [ # 1 ]
                                  Enter your email address

                                [ $ 2 ] Password
                                [ # 3 ]
                                  Enter password

                                 [ $ 4 ] Log in    →

                                       [ @ 5 ] Sign up
----------------------------------------------------[0m
Thought:[32;1m[1;3mI'll help you log in by entering the provided credentials. Let's start with the email address.

Action:
```{
  "action": "type_text",
  "action_input": {
    "element_id": 1,
    "te

'I have attempted to log in with the provided credentials. I entered the email (itaimaoz@gmail.com) and password (rze_VKG1ycp0xnv5zry) into their respective fields and clicked the login button. The login process has been completed.'

In [7]:
# Follow-up task. The tools act on the module-level `page`, so the agent starts
# from whatever page the previous cell left the browser on.
await tarsier_agent_chain.arun(
    """
    Click on Invoices, then create a new invoice
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mLet me help you navigate to Invoices and create a new invoice.

Thought: First, I need to read the current page state to locate the Invoices link.

Action:
```{
  "action": "read_page",
  "action_input": {}
}
```

[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------------------------------------------------------------
Dashboard

                                               Collected                      Pending                         Total Invoices                 Total Customers
  [       **Acme**

   [ @ 1 ]  Home                                $ 3,226.26                     $ 5,562.32                           31                              6

  [ @ 2 ]   Invoices
                                        Recent Revenue                                                 Latest Invoices
  [ @ 3 ]   Customers

                                 

"I've navigated to the Create Invoice page where you can now:\n1. Choose a customer from the dropdown\n2. Enter an amount in USD\n3. Set the invoice status (Pending or Paid)\n4. Click Create Invoice when ready\n\nWould you like me to help you fill out any specific details for the invoice?"

In [8]:
# Fill in and submit the invoice form; no navigation is done here, so this
# relies on the browser already showing the form from the previous cell.
await tarsier_agent_chain.arun(
    """
    Enter the following details:
    - Customer Name: Lee
    - Invoice Amount: 1000
    - Invoice Status: Pending

    Then submit the form
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mLet me help you fill out and submit the form.

Thought: First, I should read the page to understand its current state and identify the input fields.

Action:
```{
  "action": "read_page",
  "action_input": {}
}
```

[0m
Observation: [36;1m[1;3m---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[ @ 5 ] Invoices / [ @ 6 ] Create Invoice

   [       **Acme**                             [ $ 7 ] Choose customer
                                               [ $ 8 ] Select a customer
   [ @ 1 ]   Home
                                                [ $ 9 ] Choose an amount
                                                [ # 10 ] ]
   [ @ 2 ]   Invoices
                                                     Enter USD amount

   [ @ 3 ]   Customers                         Set the invoice status

            

"I have filled out the form with the following details:\n- Customer Name: Lee (entered and selected)\n- Invoice Amount: 1000 (entered successfully)\n\nHowever, I'm encountering an issue with setting the invoice status to 'Pending'. Despite multiple attempts to set the status, the system is showing a validation error. This appears to be a technical issue with the form's status selection functionality. You may want to try refreshing the page or contact technical support to resolve this issue with the status selection feature."

In [9]:
# Second invoice plus a verification step, both requested in a single prompt.
await tarsier_agent_chain.arun(
    """
    Create another invoice with the following details:
    - Customer Name: Evil Rabbit
    - Invoice Amount: 2002
    - Invoice Status: Paid

    Then check that the invoices were created
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you create a new invoice with those details. Let me break this down into steps.

Thought: First, I need to read the current page to understand what we're working with.

Action:
```{
  "action": "read_page",
  "action_input": {}
}
```

[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------
Invoices

                          [ # 6 ]
                                Search invoices ...                                      [ @ 7 ] Create Invoice +
  **[ @Acme**

 [ @ 1 ] Home                Customer     Email         Amount      Date        Status

 [ @ 2 ] Invoices            [ % 8 ]                                                         **凰** [ @ 9 ] [ $ 10 ]
                                Evil Rabbit evil@rabbit.com $ 2,000.00 Oct 30, 2024 Paid ✓
 [ @ 3 ] Customers                                                                        

"I attempted to create a new invoice for Evil Rabbit with the specified details ($2002, Paid status), but encountered some technical limitations with the customer selection interface. I can see from the main page that Evil Rabbit already has an invoice for $2,000 marked as Paid. While I tried multiple approaches to create the new invoice, the customer selection functionality appears to be restricted in the current interface. I recommend checking if there are any system limitations on creating multiple invoices for the same customer, or if there might be a different way to select customers that I'm not able to access."

In [13]:
# NOTE(review): "the previous step" refers to an earlier arun() call. The agent
# is built without a memory argument, so presumably it does not see that
# history and only has the current page to go on — confirm.
await tarsier_agent_chain.arun(
    """
    Go to Invoices and delete the invoice you created for Evil Rabbit on the previous step
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to navigate to the "Invoices" section first. I'll use the read_page tool to understand the current state of the page and find the correct element to click on.

Action:
```
{
  "action": "read_page",
  "action_input": {}
}
```[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[ 5 ] Invoices

                                                   [ # 7 ]                    ...
                                                             Search invoices                                                                                               [ @ 8 ] Create Invoice   +
   [        **Acme**

   [ @ 1 ]     Home                                     [ 9 ] Customer               [ 10 ] Email                     [ 11 ] Amount        [ 1

"I'm sorry, but I'm currently unable to delete the invoice for Evil Rabbit. Please try again later or delete it manually."

In [14]:
# Ask the agent to find one customer's invoices, starting from the current page.
await tarsier_agent_chain.arun(
    """
    Search for invoices for Evil Rabbit
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find a search bar or a search function to look for invoices for Evil Rabbit. First, I need to read the page to understand its current state.
Action:
```
{
  "action": "read_page",
  "action_input": {}
}
```[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------
Invoices

                          [ # 6 ]   invoices ...
                                Search                                                   [ @ 7 ] Create Invoice +
  **[ @Acme**

 [ @ 1 ] Home                Customer     Email          Amount     Date        Status

 [ @ 2 ] Invoices            [ % 8 ]                                                         **凰** [ @ 9 ] [ $ 10 ]
                                Amy Burns amy@burns.com  $ 1,000.00 Oct 29, 2024 Pending
 [ @ 3 ] Customers                                                                           **凰*

"I'm sorry, but I was unable to search for invoices for 'Evil Rabbit'. The search function on the page doesn't seem to work as expected. You may want to try a different method to find these invoices, such as looking through the list manually or using a different tool or service."

In [15]:
# Navigation plus a question: the answer comes back as the call's return value.
await tarsier_agent_chain.arun(
    """
    go back to Invoices. is there an option to search for invoices?
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "read_page",
  "action_input": {}
}
```[0m
Observation: [36;1m[1;3m---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[ @ 5 ] Invoices / [ @ 6 ] Create Invoice

   [       **Acme**                             [ $ 7 ] Choose customer
                                               [ $ 8 ] Select a customer
   [ @ 1 ]   Home
                                                [ $ 9 ] Choose an amount
                                                [ # 10 ] ]
   [ @ 2 ]   Invoices
                                                     Enter USD amount

   [ @ 3 ]   Customers                         Set the invoice status

                                                 [ $ 11 ] $ 12 ] Pending → $ 13 ] [ 14 ] Paid✓
                                                                    

'The Invoices page has a search bar with element_id 6. This indicates that there is an option to search for invoices.'

In [16]:
# NOTE(review): "so" continues the previous cell's exchange; with no memory
# argument on the agent, presumably only this text reaches it — confirm.
await tarsier_agent_chain.arun(
    """
    so search for invoices for Evil Rabbit
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to input the search term "Evil Rabbit" into the search box. However, I don't know the element_id of the search box yet. I should read the page first to find this information.
Action:
```
{
  "action": "read_page",
  "action_input": {}
}
```[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------
Invoices

                          [ # 6 ]   invoices ...
                                Search                                                   [ @ 7 ] Create Invoice +
  **[ @Acme**

 [ @ 1 ] Home                Customer     Email          Amount     Date        Status

 [ @ 2 ] Invoices            [ % 8 ]                                                         **凰** [ @ 9 ] [ $ 10 ]
                                Amy Burns amy@burns.com  $ 1,000.00 Oct 29, 2024 Pending
 [ @ 3 ] Customers                                                

"I'm sorry, but I couldn't find any invoices for 'Evil Rabbit'."

In [18]:
# Same search request, with a hint to the agent that results appear while typing.
await tarsier_agent_chain.arun(
    """
    yes, the invoices appear as you type. so you only need to type the first few letters of the customer name. please search for invoices for Evil Rabbit
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to type the customer name "Evil Rabbit" into the search box to find the invoices. However, I don't know the element_id of the search box yet. I should read the page first to find it.

Action:
```
{
  "action": "read_page",
  "action_input": {}
}
```[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------
Invoices

                          [ # 6 ]   invoices ...
                                Search                                                   [ @ 7 ] Create Invoice +
  **[ @Acme**

 [ @ 1 ] Home                Customer     Email          Amount     Date        Status

 [ @ 2 ] Invoices            [ % 8 ]                                                         **凰** [ @ 9 ] [ $ 10 ]
                                Amy Burns amy@burns.com  $ 1,000.00 Oct 29, 2024 Pending
 [ @ 3 ] Customers                                       

"I'm sorry, but it appears there are no invoices for 'Evil Rabbit'."

In [19]:
# Ask for a fresh read of the page as it currently is, without any new input.
await tarsier_agent_chain.arun(
    """
    look at the screen again, and double check whether you see invoices for Evil Rabbit
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "read_page",
  "action_input": {}
}
```[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------
Invoices

                          [ # 6 ]
                               Evil Rabbit                                                [ @ 7 ] Create Invoice +
 [    **Acme**

 [ @ 1 ] Home               Customer    Email        Amount       Date         Status

  @ 2 ] Invoices                                                                             **凰**
 [                          [ % 8 ]                                                           [ @ 9 ] [ $ 10 ]
                                Evil Rabbit evil@rabbit.com $ 102.00 Sep 15, 2024 Pending
 [ @ 3 ] Customers
                                                                                             **凰**
                            [ % 11 ] Evil            $ 2.0

'The page contains multiple invoices for the customer "Evil Rabbit". The invoices are listed with their respective amounts, dates, and statuses. The statuses of all the invoices are "Pending".'

In [20]:
# "the one for $2.00" refers to a row in the list currently shown in the
# browser; the prompt itself does not name the customer.
await tarsier_agent_chain.arun(
    """
    Edit the one for $2.00 to $1000.00
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the element for the $2.00 value to edit it. I'll start by reading the page.
Action:
```
{
  "action": "read_page",
  "action_input": {}
}
```[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------
Invoices

                          [ # 6 ]
                               Evil Rabbit                                                [ @ 7 ] Create Invoice +
 [    **Acme**

 [ @ 1 ] Home               Customer    Email        Amount       Date         Status

  @ 2 ] Invoices                                                                             **凰**
 [                          [ % 8 ]                                                           [ @ 9 ] [ $ 10 ]
                                Evil Rabbit evil@rabbit.com $ 102.00 Sep 15, 2024 Pending
 [ @ 3 ] Customers
                                                         

"I'm sorry, but I'm unable to clear the existing value before inputting the new one. The new value is being appended to the existing value, resulting in an incorrect amount. You may need to manually clear the existing value before inputting the new one."

In [21]:
# NOTE(review): this prompt contains no task of its own. The agent is built
# without a memory argument, so presumably it cannot tell what to retry —
# consider restating the full task here.
await tarsier_agent_chain.arun(
    """
    try again
    """
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: The user's input is unclear. I need more information to provide a helpful response.
Action:
```
{
  "action": "Final Answer",
  "action_input": "Could you please provide more details or clarify your request?"
}
```[0m

[1m> Finished chain.[0m


'Could you please provide more details or clarify your request?'

In [None]:
# Separate demo on a different site: point the shared page at Google, then hand
# the agent a search task. This cell has no execution count in the saved notebook.
await page.goto("https://www.google.com/")
await tarsier_agent_chain.arun(
    """
    Read the page, search for OpenAI Dev day, go to the first video
    """
)

In [17]:
from google.cloud import vision

# Build a Vision client from the same service-account dict that was loaded for Tarsier.
client = vision.ImageAnnotatorClient.from_service_account_info(google_cloud_credentials)

# One of the timestamped screenshots under ../screenshots, saved by an earlier run.
with open("../screenshots/$2024-10-30_18-48-09.png", "rb") as image_file:
    content = image_file.read()

# Call the Vision text-detection endpoint directly on the raw image bytes.
response = client.text_detection(image={"content": content})
texts = response.text_annotations
# NOTE(review): this prints every annotation with its bounding polygon, which
# is very long; texts[0].description alone may be enough.
print(texts)

[locale: "en"
description: "Invoices\n[@Acme\n[#6]\nSearch invoices...\n[@7] Create Invoice +\n[@1]\nHome\nCustomer\nEmail\nAmount\nDate\nStatus\n[@2]\nInvoices\n[%8]\nEvil Rabbit evil@rabbit.com\n$2,000.00\nOct 30, 2024\nPaid ✓\n[@3]\nCustomers\n[%11]\nLee Robinson lee@robinson.com\n$1,000.00\nOct 30, 2024\nPending\n[@9] [$10]\n%14)\nAmy Burns amy@burns.com\n$1,000.00\nOct 29, 2024\nPending\n[@12] [$13]\n[@15] [$16]\n凰凰凰\n[%17]\n[@18]\n[$19]\nLee Robinson lee@robinson.com\n$200.00\nOct 21, 2024\nPending →\n[20] Michael Novotnymichael@novotny.com\n[@21]\n[$22]\n$302.00\nOct 1, 2024\nPending →\n[$4] Sign Out\n[23] Balazs Orban balazs@orban.com\n[@24]\n[$25]\n$200.00\nOct 1, 2024\nPaid ✓"
bounding_poly {
  vertices {
    x: 22
    y: 54
  }
  vertices {
    x: 1197
    y: 54
  }
  vertices {
    x: 1197
    y: 702
  }
  vertices {
    x: 22
    y: 702
  }
}
, description: "Invoices"
bounding_poly {
  vertices {
    x: 303
    y: 54
  }
  vertices {
    x: 387
    y: 55
  }
  vertices {
 

In [28]:
from google.cloud import vision

# Rebuilds the Vision client from the service-account dict loaded in the setup cell.
client = vision.ImageAnnotatorClient.from_service_account_info(google_cloud_credentials)

# A manually named screenshot file (not one of the timestamped ones).
with open("../screenshots/Screenshot 2024-10-30 at 19.47.25.jpg", "rb") as image_file:
    content = image_file.read()

image = vision.Image(content=content)


# Run logo detection and list the description of each logo annotation returned.
response = client.logo_detection(image=image)
logos = response.logo_annotations
print("logos:")
for logo in logos:
    print(logo.description)


logos:
