In [None]:
import os
import shlex
import shutil
import subprocess
import sys

def resolve_agent_browser(
    default: str = "/Users/hariom/.nvm/versions/node/v24.13.0/bin/agent-browser",
) -> str:
    """Locate the agent-browser CLI executable.

    Lookup order:
      1. the ``AGENT_BROWSER`` environment variable, if set and non-empty;
      2. the first ``agent-browser`` found on ``PATH``;
      3. ``default`` (the original hard-coded install location).

    Args:
        default: Path returned when neither the environment variable nor
            ``PATH`` yields an executable.

    Returns:
        The path (or name) of the executable to launch.
    """
    return (
        os.environ.get("AGENT_BROWSER")
        or shutil.which("agent-browser")
        or default
    )


# Executable used for every CLI invocation; resolved once when this cell runs.
AGENT_BROWSER = resolve_agent_browser()

def run(cmd: str):
    """Execute one agent-browser CLI command and return its standard output.

    The first token of ``cmd`` (conventionally the literal ``agent-browser``)
    is replaced by the ``AGENT_BROWSER`` executable path. The remaining tokens
    are passed as an argument list, so no shell is involved.

    Args:
        cmd: Command line such as ``'agent-browser fill e3 "Virtual Insight"'``.
            Quoted arguments are kept together as a single argument.

    Returns:
        The captured stdout of the process as text, or ``""`` when ``cmd``
        contains no tokens (nothing is executed in that case).
    """
    try:
        # Shell-style tokenising keeps quoted text (e.g. a search phrase)
        # together as one argument instead of splitting it on every space.
        parts = shlex.split(cmd)
    except ValueError:
        # Unbalanced quote (e.g. an apostrophe in free text): fall back to
        # plain whitespace splitting rather than failing the whole step.
        parts = cmd.split()

    if not parts:
        # Blank command: there is no token to replace and nothing to run.
        return ""

    parts[0] = AGENT_BROWSER

    result = subprocess.run(
        parts,
        capture_output=True,
        text=True
    )

    if result.returncode != 0:
        # Report failures instead of discarding them; the return value
        # stays stdout-only so existing callers are unaffected.
        print(
            f"[agent-browser exited with {result.returncode}] {result.stderr}",
            file=sys.stderr,
        )
    return result.stdout

In [None]:
# System prompt sent with every LLM request. The driver loop terminates only on
# the literal reply 'stop', so 'stop' is the single completion sentinel
# advertised here (both in ALLOWED COMMANDS and in BEHAVIOR).
SYSTEM_PROMPT = """You are a browser automation agent.
Your task is to achieve the user's GOAL by controlling a browser using a CLI tool.

STRICT OUTPUT RULES:
1. Analysis: Mentally break the GOAL into a plan of atomic steps.
2. Selection: Identify exactly ONE command from the ALLOWED list that accomplishes the CURRENT step.
3. Use the previous input as context and do not reissue commands that have already been applied.
4. Final Output: Output only one CMD from the ALLOWED COMMANDS using references from Current Snapshot
5. NO markdown, NO code blocks, NO explanations, NO extra text.
6. Select commands only using information available in the Current Snapshot.
 

ALLOWED COMMANDS:
open <url>              # Navigate to a website
click <ref>             # Click element using its reference (e.g., click e3)
dblclick <ref>          # Double-click element
fill <ref> <text>       # Clear and fill input field
type <ref> <text>       # Type into element without clearing
press <key>             # Press key (Enter, Tab, Control+a)
hover <ref>             # Hover over element
select <ref> <val>      # Select dropdown option
check <ref>             # Check checkbox
uncheck <ref>           # Uncheck checkbox
scroll <dir> [px]       # Scroll (up/down/left/right)
screenshot              # Take a screenshot
snapshot                # Get accessibility tree with refs
stop                    # Output ONLY 'stop' when the GOAL is fully met

USAGE RULES:
- References MUST be taken from the provided snapshot [ref=x]. 
- Never guess a reference; if not found, use 'snapshot' to refresh.
- If the GOAL is searching, use 'fill' followed by 'press Enter' in the next turn.

BEHAVIOR:
- Divide complex GOALS into: [Navigate] -> [Locate] -> [Interact] -> [Verify].
- Act conservatively. Do not skip steps.
- If you reach the final state of the GOAL, respond ONLY with 'stop'.
"""

In [None]:
import re
from openai import OpenAI

def llm(user_prompt: str):
    """Ask the local chat model for the next command.

    Sends ``SYSTEM_PROMPT`` plus ``user_prompt`` to the OpenAI-compatible
    endpoint and returns the reply with any ``<think>`` reasoning removed.

    Args:
        user_prompt: The per-step prompt (goal, last action, snapshot).

    Returns:
        The cleaned reply as a stripped string; ``""`` when the model returned
        no content or only reasoning text.
    """
    client = OpenAI(
        base_url="http://0.0.0.0:8000/v1",
        api_key="Local"
    )

    response = client.chat.completions.create(
        model="local-model",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ],
    )

    # `content` is optional in the chat-completions response; treat a missing
    # value as an empty reply instead of letting re.sub raise TypeError.
    raw = response.choices[0].message.content or ""

    # Remove complete <think>...</think> blocks.
    cmd = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)

    # A closing tag with no opening tag left: everything before it is still
    # reasoning, so keep only what follows the last one.
    if "</think>" in cmd:
        cmd = cmd.rsplit("</think>", 1)[1]

    # An opening tag that was never closed (e.g. truncated output): drop the
    # dangling reasoning so it is not mistaken for a command.
    if "<think>" in cmd:
        cmd = cmd.split("<think>", 1)[0]

    return cmd.strip()

In [None]:
def build_prompt(goal, last_action, snapshot):
    """Assemble the per-step user prompt.

    The prompt has three labelled sections: the goal, the previous action
    (``NONE`` when ``last_action`` is falsy) and the current page snapshot.
    A blank line separates the last action from the snapshot section, and the
    text ends with a newline.
    """
    previous = last_action or "NONE"
    sections = (
        "GOAL:",
        f"{goal}",
        "LAST ACTION:",
        f"{previous}",
        "",
        "CURRENT SNAPSHOT:",
        f"{snapshot}",
        "",  # yields the trailing newline
    )
    return "\n".join(sections)

In [None]:
goal = "Go to youtube and search for Virtual Insight."
last_action = "Browser Loaded"

# Upper bound on iterations so a model that never emits a stop word cannot
# keep the notebook (and the browser) running forever.
MAX_STEPS = 25
# Replies that end the run. 'done' is accepted as well as 'stop' because a
# model may use either word to signal completion.
STOP_WORDS = {"stop", "done"}

for step in range(1, MAX_STEPS + 1):
    print(f"\n========== STEP {step} ==========")

    # Fresh snapshot every step so element refs match the current page.
    snapshot = run("agent-browser snapshot -i")

    prompt = build_prompt(
        goal=goal,
        last_action=last_action,
        snapshot=snapshot
    )

    print("\n--- PROMPT SENT TO LLM ---")
    print(prompt)

    cmd = llm(user_prompt=prompt)

    print("\n--- LLM OUTPUT (CMD) ---")
    print(cmd)

    # An empty reply is treated as a stop as well: there is nothing to execute.
    if not cmd or cmd.strip().lower() in STOP_WORDS:
        print("\n--- STOP CONDITION MET ---")
        break

    print("\n--- EXECUTING ---")
    print("agent-browser", cmd)

    run("agent-browser " + cmd)

    # Fed back into the next prompt so the model does not repeat itself.
    last_action = cmd
else:
    # Only reached when the loop was never broken out of.
    print(f"\n--- STEP LIMIT ({MAX_STEPS}) REACHED WITHOUT STOP ---")
