# Web browsing agents with langchain
### Setup agent functions

In [1]:
# Install the packages this notebook imports. %pip (rather than !pip) targets
# the environment of the running kernel.
# NOTE(review): versions are unpinned, so a fresh install may pull releases that
# differ from the ones this notebook was written against — consider pinning.
%pip install python-dotenv
%pip install langchain
%pip install langchain-community
%pip install playwright
%pip install tarsier
%pip install openai
# Playwright CLI step; presumably fetches the browser builds that
# p.chromium.launch(...) needs later in the notebook — verify.
!playwright install

In [1]:
from playwright.async_api import async_playwright

# Setup Playwright
p = await async_playwright().__aenter__()
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()


In [15]:
import json
import os
from datetime import datetime

from dotenv import load_dotenv
from langchain.agents import tool
from langchain.chat_models import ChatOpenAI
from tarsier import GoogleVisionOCRService, Tarsier

# Load environment variables from the project-local dotenv file.
# presumably this is where OPENAI_API_KEY is expected to come from — verify
load_dotenv("../.env.local")


# Setup Creds
# Service-account info for Google Cloud is kept in a JSON file next to the project.
with open("../.tarsier.json") as credentials_file:
    google_cloud_credentials = json.load(credentials_file)

# Setup Tarsier
ocr_service = GoogleVisionOCRService(google_cloud_credentials)
tarsier = Tarsier(ocr_service)
# Module-level lookup from element id to tag metadata; refilled on every page read.
tag_to_xpath = {}


# Define tools/actions
@tool
async def read_page() -> str:
    """
    Use to read the current state of the page
    """
    # The docstring above is consumed by @tool (presumably as the description
    # the LLM sees), so its wording is deliberately left untouched.
    page_state = await read_page_impl()
    return page_state


async def read_page_impl() -> str:
    """Read the current page through Tarsier and refresh the element lookup.

    Side effects:
        * saves a timestamped screenshot under ``../screenshots/``;
        * replaces the contents of the module-level ``tag_to_xpath`` dict with
          the mapping returned for this read.

    Returns:
        The text representation of the page produced by ``tarsier.page_to_text``.
    """
    page_text, inner_tag_to_xpath = await tarsier.page_to_text(page)

    # Timestamped filename so successive reads do not overwrite each other.
    # The original f-string used JS-style "${now}", which put a literal "$" in
    # front of every filename; screenshots saved by earlier runs still carry it.
    now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    await page.screenshot(path=f"../screenshots/{now}.png")

    # Mutate in place (instead of rebinding) so the tools that read the
    # module-level dict always see the mapping for the latest page read.
    tag_to_xpath.clear()
    tag_to_xpath.update(inner_tag_to_xpath)
    return page_text


@tool
async def click(element_id: int) -> str:
    """
    Click on an element based on element_id and return the new page state
    """
    # The docstring above is consumed by @tool, so its text is kept as is.
    # Resolve the id from the most recent page read to its XPath.
    xpath = tag_to_xpath[element_id]['xpath']
    print(xpath)

    target = page.locator(xpath)
    await target.scroll_into_view_if_needed()
    # Pause briefly after scrolling, click, then wait longer before re-reading.
    await page.wait_for_timeout(1000)
    await target.click()
    await page.wait_for_timeout(2000)

    new_state = await read_page_impl()
    return new_state


@tool
async def type_text(element_id: int, text: str) -> str:
    """
    Input text into a textbox based on element_id and return the new page state
    """
    # The docstring above is consumed by @tool, so its text is kept as is.
    xpath = tag_to_xpath[element_id]['xpath']
    print(xpath)

    field = page.locator(xpath)
    # Best effort: try to empty the field first; if clearing fails, report the
    # error and type anyway.
    try:
        await field.clear()
    except Exception as clear_error:
        print(clear_error)

    await field.press_sequentially(text)
    new_state = await read_page_impl()
    return new_state


@tool
async def press_key(key: str) -> str:
    """
    Press a key on the keyboard and return the new page state
    """
    # The docstring above is consumed by @tool, so its text is kept as is.
    keyboard = page.keyboard
    await keyboard.press(key)
    # Wait before re-reading so the page has time to react to the key press.
    await page.wait_for_timeout(2000)
    new_state = await read_page_impl()
    return new_state

### GPT-4 + Tarsier

In [3]:
from langchain.prompts import ChatPromptTemplate
from langchain.agents import initialize_agent, AgentType
from langchain.chains import LLMChain

# ReAct-style prompt text. {tools}, {tool_names}, {input} and {agent_scratchpad}
# are template placeholders to be filled in when the prompt is formatted.
# NOTE(review): the "previous tasks" section below is empty, and this template
# is only passed to LLMChain in this cell, not to initialize_agent — confirm
# whether it is still needed.
template = """
You are a web interaction agent. Use the read page tool to understand where you currently are. 
You will be passed in OCR text of a web page where element ids are to the left of elements. 

You have access to the following tools:
{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

These were previous tasks you completed:

Begin!

Question: {input}
{agent_scratchpad}"""
# Wrap the raw text in a chat prompt template.
prompt = ChatPromptTemplate.from_template(template)

# Chat model shared by the standalone chain and the agent.
llm = ChatOpenAI(temperature=0, model_name="gpt-4")
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Setup chain
# NOTE(review): press_key is defined as a tool earlier in the notebook but is
# not registered here — confirm the omission is intentional.
tarsier_agent_chain = initialize_agent(
    tools=[read_page, click, type_text],
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

# Run the agent

In [5]:
await page.goto("https://nextjs-dashboard-nine-phi-61.vercel.app/login")
await tarsier_agent_chain.arun(
    """
    Read the page, log in with username: itaimaoz@gmail.com and password: rze_VKG1ycp0xnv5zry
    """
)

In [6]:
await tarsier_agent_chain.arun(
    """
    Click on Invoices, then create a new invoice
    """
)

In [7]:
await tarsier_agent_chain.arun(
    """
    Enter the following details:
    - Customer Name: Lee
    - Invoice Amount: 1000
    - Invoice Status: Pending

    Then submit the form
    """
)

In [16]:
await tarsier_agent_chain.arun(
    """
    Create another invoice with the following details:
    - Customer Name: Evil Rabbit
    - Invoice Amount: 2000
    - Invoice Status: Paid

    Then navigate back to the dasahbotd and check that the invoices were created
    """
)

In [13]:
await tarsier_agent_chain.arun(
    """
    Go to Invoices and delete the invoice you created for Evil Rabbit on the previous step
    """
)

In [14]:
await tarsier_agent_chain.arun(
    """
    Search for invoices for Evil Rabbit
    """
)

In [15]:
await tarsier_agent_chain.arun(
    """
    go back to Invoices. is there an option to search for invoices?
    """
)

In [16]:
await tarsier_agent_chain.arun(
    """
    so search for invoices for Evil Rabbit
    """
)

In [18]:
await tarsier_agent_chain.arun(
    """
    yes, the invoices appear as you type. so you only need to type the first few letters of the customer name. please search for invoices for Evil Rabbit
    """
)

In [19]:
await tarsier_agent_chain.arun(
    """
    look at the screen again, and double check whether you see invoices for Evil Rabbit
    """
)

In [20]:
await tarsier_agent_chain.arun(
    """
    Edit the one for $2.00 to $1000.00
    """
)

In [21]:
await tarsier_agent_chain.arun(
    """
    try again
    """
)

In [None]:
await page.goto("https://www.google.com/")
await tarsier_agent_chain.arun(
    """
    Read the page, search for OpenAI Dev day, go to the first video
    """
)

In [17]:
from google.cloud import vision

# Run Cloud Vision text detection directly on one saved screenshot, using the
# same service-account info that was loaded for Tarsier.
client = vision.ImageAnnotatorClient.from_service_account_info(google_cloud_credentials)

# The "$" in the filename is literal: it matches a screenshot saved by an earlier run.
with open("../screenshots/$2024-10-30_18-48-09.png", "rb") as image_file:
    content = image_file.read()

response = client.text_detection(image={"content": content})
# A failed annotation is reported on response.error rather than raised, so
# check it explicitly instead of silently printing an empty result.
if response.error.message:
    raise RuntimeError(f"Cloud Vision text detection failed: {response.error.message}")
texts = response.text_annotations
print(texts)

In [28]:
from google.cloud import vision

# Run Cloud Vision logo detection on a manually captured screenshot.
client = vision.ImageAnnotatorClient.from_service_account_info(google_cloud_credentials)

with open("../screenshots/Screenshot 2024-10-30 at 19.47.25.jpg", "rb") as image_file:
    content = image_file.read()

image = vision.Image(content=content)


response = client.logo_detection(image=image)
# A failed annotation is reported on response.error rather than raised, so
# check it explicitly instead of silently printing an empty list.
if response.error.message:
    raise RuntimeError(f"Cloud Vision logo detection failed: {response.error.message}")
logos = response.logo_annotations
print("logos:")
for logo in logos:
    print(logo.description)
