In [4]:
import google.generativeai as genai
import ast
import os
import json
import PIL.Image
from selenium.webdriver.common.by import By

# Load environment variables
from dotenv import load_dotenv
from selenium_functions import open_browser

load_dotenv()

# Convert the GEMINI_API_KEYS string from the environment into a list of keys.
# The variable must hold a Python list literal of strings,
# e.g. GEMINI_API_KEYS='["key-one", "key-two"]'.
GEMINI_API_KEYS = os.environ.get("GEMINI_API_KEYS")
if not GEMINI_API_KEYS:
    # Fail with an actionable message instead of literal_eval's opaque
    # "malformed node or string: None" when the variable is missing.
    raise ValueError(
        "GEMINI_API_KEYS is not set; expected a list literal such as "
        "'[\"key-one\", \"key-two\"]'."
    )
KEY_LIST = ast.literal_eval(GEMINI_API_KEYS)
if not isinstance(KEY_LIST, (list, tuple)) or not KEY_LIST:
    # A bare string would be indexed character by character, and an empty list
    # would only fail later inside cycle_api_key; reject both up front.
    # The offending value is deliberately not echoed because it contains secrets.
    raise ValueError("GEMINI_API_KEYS must evaluate to a non-empty list of API keys.")

# Global index to keep track of the current key
current_api_key_index = 0

from lxml import html


def extract_elements_by_xpath(html_string, xpath_selector):
    """Evaluate an XPath selector against an HTML string and return the matches.

    Args:
        html_string: HTML markup to parse.
        xpath_selector: XPath expression evaluated against the parsed tree.

    Returns:
        The ``str()`` of a list with one entry per match, followed by
        ``xpath_selector`` itself as the final entry. Element matches are
        serialized to HTML; string and scalar results (attribute or ``text()``
        selectors, ``count()``, ``string()``, ...) are included as plain text.
    """
    # Parse the HTML
    tree = html.fromstring(html_string)

    # Apply the XPath selector
    matches = tree.xpath(xpath_selector)

    # XPath functions such as count() or string() yield a bare scalar, not a list.
    if not isinstance(matches, list):
        matches = [matches]

    def _render(match):
        # Attribute/text selectors (e.g. //img/@src) return strings, which
        # html.tostring cannot serialize; pass those through as text instead.
        if isinstance(match, (str, bool, int, float)):
            return str(match)
        return html.tostring(match).decode("utf-8")

    # Return the rendered matches with the selector appended, as one string
    return str([_render(match) for match in matches] + [xpath_selector])
    


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# System instruction for the "interpret" request: it asks the model to answer a
# user request with selectors (xpath / id / image src) for the relevant UI.
# NOTE(review): the prompt tells the model it has "one function" (navigate), but
# the notebook registers two tools (navigate and get_html) — confirm which is intended.
# NOTE(review): the format example below is not strict JSON (unquoted values, no
# commas between the two keys) — confirm downstream parsing copes with the output.
system_prompt_interpret = """
You are a web browser navigation assistant that trims and scrapes relevant portions of the UI for a user.

You have access to one function that allows you to navigate to a URL.

Whenever a user requests something, you will return an xpath selector, id selector, or src attribute of an image that is relevant to the user.
If the current url is not relevant to the user's request, you can navigate to a new URL.

If the request requires multiple choices, return ALL RELEVANT selectors that contains the UI that will enable the user to choose the choice themselves.

For example, if there is a container containing two buttons, and it is ambiguous which button the user is interested in, return a selector to the container instead of one of the buttons only.

If there are images that are relevant to the user, return the src attribute of the image.

Output your result in the following format:

If the user is interested in a specific part of the UI, output your result in the following format:
[
    {
        "type": xpath
        "selector": the selection string
    },
    {
        "type": id
        "selector": the selection string
    },
    {
        "type": src
        "selector": the src attribute of the image
    }
]
"""

# System instruction for the "generate" request: the model receives DOM fragments
# and must answer with TailwindCSS-styled HTML whose elements carry the original
# selector in `special-id`.
# Fixed: the attribute list is introduced without a wrong count (it said "two" but
# listed three), and the example button tag no longer has a stray double quote.
system_prompt_generate = """
You are a web browser navigation assistant that generates a user interface for a user to interact with.
You will be given DOM elements from another web browser navigation assistant that trims and scrapes relevant portions of the UI for a user.

Your task is to generate valid HTML strings that can be rendered in a browser, specifically focusing on interactive elements such as buttons and text fields.
Please use TailwindCSS for styling.

Each element should only have the following attributes:
- class: a string of classes separated by spaces, for TailwindCSS styling
- special-id: the XPath or id selector that was given to you, which will be used for identifying the element during interactions
- type: a string that is either 'text', 'button', 'input', or 'img'

Only output images if they are contained in the DOM elements that were given to you.

Output your result in the following format:
<div class='container classes here'>
    <div type='text' class='input classes here'>
        <!-- Additional content here -->
    </div>
    <div type='button' class='button classes here' special-id='button selector here'>
        <!-- Additional content here -->
    </div>
    <input type='input' class='input classes here' special-id='input selector here'>
    <img type='img' class='img classes here' src='image source here'>
</div>
"""
# Load website.html into a string.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (which is not UTF-8 everywhere).
# NOTE(review): assumes the saved page is UTF-8 — confirm.
with open("website.html", "r", encoding="utf-8") as file:
    website_html = file.read()

def navigate(url: str):
    """Navigates to the url"""
    # Stub tool: nothing is actually opened; the request is only logged and a
    # fixed acknowledgement is returned.
    # The docstring above is left untouched because this function is handed to
    # the model as a tool (presumably the SDK uses it as the tool description).
    log_line = f"NAVIGATE: {url}"
    print(log_line)
    acknowledgement = "Navigated"
    return acknowledgement

def get_html(url: str):
    """Get the HTML of the page"""
    # Stub tool: logs the requested URL, then returns the module-level
    # `website_html` string regardless of which `url` was asked for.
    # The docstring above is left untouched because this function is handed to
    # the model as a tool (presumably the SDK uses it as the tool description).
    log_line = f"GET_HTML: {url}"
    print(log_line)
    page_source = website_html
    return page_source


def cycle_api_key():
    """Advance the shared key index by one, wrapping around, and return that key.

    The index is moved before the lookup, so the returned key is the one at the
    newly selected position in ``KEY_LIST``.
    """
    global current_api_key_index
    last_position = len(KEY_LIST) - 1
    # Wrap to the first key once the last one is reached; otherwise step forward.
    current_api_key_index = (
        0 if current_api_key_index >= last_position else current_api_key_index + 1
    )
    return KEY_LIST[current_api_key_index]


def generate_content_with_cycling_keys(prompt, system_prompt, image=None, tools=None):
    """Send one request to the Gemini model using the next API key in the rotation.

    Args:
        prompt: Prompt passed to the model.
        system_prompt: System instruction the model is created with.
        image: Optional image; when given, the model receives ``[prompt, image]``.
        tools: Optional tools forwarded to ``genai.GenerativeModel``.

    Returns:
        Whatever ``model.generate_content`` returns.
    """
    # Rotate first: cycle_api_key() advances the shared index and hands back the
    # key that this request is configured with.
    genai.configure(api_key=cycle_api_key())

    generation_settings = genai.GenerationConfig(
        max_output_tokens=8000,
        temperature=0,
    )
    model = genai.GenerativeModel(
        "gemini-1.5-pro-latest",
        generation_config=generation_settings,
        system_instruction=system_prompt,
        tools=tools,
    )

    # A text-only request sends the bare prompt; with an image both are sent together.
    contents = prompt if image is None else [prompt, image]
    return model.generate_content(contents)

In [8]:
# The user request to interpret; a plain string, since nothing is interpolated.
user_prompt = """
user: Show me the deals for dominos.com
"""

# Python callables exposed to the model as tools.
tools = [navigate, get_html]

# Send the request with the interpret system prompt and the tools attached.
# The page HTML is not part of the prompt; the model can only ask for it via get_html.
response = generate_content_with_cycling_keys(
    prompt=user_prompt,
    system_prompt=system_prompt_interpret,
    tools=tools,
)
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=glm.GenerateContentResponse({'candidates': [{'content': {'parts': [{'function_call': {'name': 'navigate', 'args': {'url': 'https://www.dominos.com/en/pages/order/#!/menu/category/all/'}}}], 'role': 'model'}, 'finish_reason': 1, 'index': 0, 'safety_ratings': [{'category': 10, 'probability': 1, 'blocked': False}, {'category': 8, 'probability': 1, 'blocked': False}, {'category': 9, 'probability': 1, 'blocked': False}, {'category': 7, 'probability': 1, 'blocked': False}], 'token_count': 0, 'grounding_attributions': []}]}),
)