In [None]:
"""
voice_navigator.py – voice-controlled Firefox assistant
(cleaned-up: English only, no extra GPT translation hop)
"""

import io, json, os, time, urllib.parse
import openai, speech_recognition as sr
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
)
from webdriver_manager.firefox import GeckoDriverManager

# ─────────────────────────────────────────────────────────────────────────────
#  ENV / KEYS
# ─────────────────────────────────────────────────────────────────────────────
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
MODEL_CHAT     = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
MODEL_WHISPER  = "whisper-1"
client = openai.OpenAI()

# ─────────────────────────────────────────────────────────────────────────────
#  SPEECH SETTINGS
# ─────────────────────────────────────────────────────────────────────────────
r = sr.Recognizer()
r.pause_threshold, r.energy_threshold = 0.8, 300

# ─────────────────────────────────────────────────────────────────────────────
#  BROWSER BOOTSTRAP
# ─────────────────────────────────────────────────────────────────────────────
service   = Service(GeckoDriverManager().install())
driver    = None
listening = True
MAX_BADGES = 20
SEL = "a, button"

# ─────────────────────────────────────────────────────────────────────────────
#  JAVASCRIPT HELPERS
# ─────────────────────────────────────────────────────────────────────────────
JS_UPDATE_LIST = """
const cap = arguments[0];
const visible = [...document.querySelectorAll('%s')].filter(el => {
  if (!el.isConnected) return false;
  const r = el.getBoundingClientRect();
  const cs = getComputedStyle(el);
  return (
    r.width && r.height &&
    r.bottom > 0 && r.top < innerHeight &&
    cs.visibility !== 'hidden' &&
    cs.display   !== 'none'   &&
    cs.pointerEvents !== 'none'
  );
}).slice(0, cap);

window._vnVisibleElements = visible;
visible.forEach((el, i) => el.dataset.vnIdx = i + 1);

return visible.map((el, i) => {
  const txt = (el.innerText || el.getAttribute('aria-label') || '∅')
                .trim().replace(/\\n+/g,' ');
  return `${i + 1}. ${el.tagName} – '${txt.slice(0, 80)}'`;
});
""" % SEL

JS_DRAW_BADGES = """
document.querySelectorAll('.vn-badge').forEach(b => b.remove());
(window._vnVisibleElements || []).forEach((el, i) => {
  const r = el.getBoundingClientRect();
  const badge = document.createElement('div');
  badge.className = 'vn-badge';
  badge.textContent = i + 1;
  Object.assign(badge.style, {
    position: 'fixed',
    left:    `${r.left - 18}px`,
    top:     `${r.top  }px`,
    background: '#ffc',
    padding: '1px 3px',
    border: '1px solid #333',
    fontSize: '10px',
    fontFamily: 'monospace',
    color: '#000',
    zIndex: 2147483647,
    pointerEvents: 'none'
  });
  document.body.appendChild(badge);
});
"""

# ─────────────────────────────────────────────────────────────────────────────
#  DOM HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def refresh_visible_elements(max_items=MAX_BADGES):
    if driver is None:
        return []
    labels = driver.execute_script(JS_UPDATE_LIST, max_items)
    driver.execute_script(JS_DRAW_BADGES)
    return labels

def click_element(el, timeout=3):
    try:
        driver.execute_script(
            "arguments[0].scrollIntoView({block:'center',inline:'center'});", el
        )
        WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(el))
        el.click()
    except (ElementClickInterceptedException, ElementNotInteractableException):
        driver.execute_script("arguments[0].click();", el)

def get_state():
    if driver is None:
        return {"tabs":[], "scroll":{"y":0,"max":0}}
    try:
        tabs=[]
        for i,h in enumerate(driver.window_handles):
            driver.switch_to.window(h)
            title = driver.execute_script("return document.title") or "(untitled)"
            tabs.append({"index":i,"title":title})
        driver.switch_to.window(driver.window_handles[-1])
        y   = driver.execute_script("return window.scrollY")
        maxy= driver.execute_script("return document.body.scrollHeight")
        return {"tabs":tabs, "scroll":{"y":y,"max":maxy}}
    except Exception as e:
        print("state error:", e)
        return {"tabs":[], "scroll":{"y":0,"max":0}}

# ─────────────────────────────────────────────────────────────────────────────
#  GPT FUNCTION SCHEMA
# ─────────────────────────────────────────────────────────────────────────────
tools=[{
 "type":"function",
 "function":{
   "name":"browser_action",
   "description":"Control Firefox",
   "parameters":{
     "type":"object",
     "properties":{
       "action":{"type":"string",
                 "enum":["open","click","scroll","close_tab",
                         "switch_tab","search"]},
       "url":{"type":"string"},
       "new_tab":{"type":"boolean"},
       "direction":{"type":"string","enum":["up","down"]},
       "amount":{"type":"integer"},
       "tab_index":{"type":"integer"},
       "element_index":{"type":"integer"},
       "query":{"type":"string"}
     },
     "required":["action"]
   }
 }
}]

# ─────────────────────────────────────────────────────────────────────────────
#  GPT CALL
# ─────────────────────────────────────────────────────────────────────────────
def ask_gpt_for_action(cmd:str):
    prompt_ctx  = "\n".join(refresh_visible_elements()) or "[no visible elements]"
    prompt_state= json.dumps(get_state(), indent=2)

    sys=f"""
You are a browser assistant. Respond ONLY by calling browser_action.
Clarifications are disabled; when unsure, pick the *most likely* action.

Visible elements:
{prompt_ctx}

Browser state:
{prompt_state}

Examples:
• "click two" → action=click, element_index=2
• "search for best pizza dough" → action=search, query="best pizza dough"
• "open https://example.com" → action=open, url=...
"""

    resp = client.chat.completions.create(
        model       = MODEL_CHAT,
        temperature = 0,
        messages = [
            {"role":"system","content":sys},
            {"role":"user"  ,"content":cmd}
        ],
        tools       = tools,
        tool_choice = "auto"
    )
    return resp.choices[0].message

# ─────────────────────────────────────────────────────────────────────────────
#  EXECUTE TOOL CALL
# ─────────────────────────────────────────────────────────────────────────────
def handle_ai_command(msg):
    global driver
    if not msg.tool_calls:
        print("🤖", msg.content)
        return

    call   = json.loads(msg.tool_calls[0].function.arguments)
    action = call["action"]

    # auto-start the browser if not yet running
    if driver is None:
        driver = webdriver.Firefox(service=service)
        driver.get("about:blank")

    try:
        if action == "open":
            # fallback: GPT gave element index not URL
            if "url" not in call and "element_index" in call:
                action = "click"
            else:
                url, nt = call["url"], call.get("new_tab", False)
                if nt:
                    driver.execute_script("window.open(arguments[0],'_blank');", url)
                    driver.switch_to.window(driver.window_handles[-1])
                else:
                    driver.get(url)
                return

        if action == "search":
            query = call["query"]
            url   = "https://www.google.com/search?q=" + \
                    urllib.parse.quote_plus(query)
            driver.get(url)

        elif action == "click":
            idx = call.get("element_index")
            if idx is None:
                return
            el = driver.execute_script(
                "return window._vnVisibleElements?.[arguments[0]-1];", idx
            )
            if el:
                click_element(el)

        elif action == "scroll":
            amt  = call.get("amount", 500)
            sign = 1 if call.get("direction","down")=="down" else -1
            driver.execute_script("window.scrollBy(0, arguments[0]);", sign*amt)

        elif action == "close_tab":
            driver.close()
            if driver.window_handles:
                driver.switch_to.window(driver.window_handles[-1])

        elif action == "switch_tab":
            ti = call.get("tab_index",0)
            if 0<=ti<len(driver.window_handles):
                driver.switch_to.window(driver.window_handles[ti])

    except Exception as e:
        print("❌", e)

    time.sleep(0.2)

# ─────────────────────────────────────────────────────────────────────────────
#  MICROPHONE → WHISPER  (English only)
# ─────────────────────────────────────────────────────────────────────────────
def listen_for_command(timeout=None, limit=4):
    with sr.Microphone() as src:
        audio = r.listen(src, timeout=timeout, phrase_time_limit=limit)

    wav = io.BytesIO(audio.get_wav_data()); wav.name = "speech.wav"

    transcript = client.audio.transcriptions.create(
        model           = MODEL_WHISPER,
        file            = wav,
        language        = "en",      # expect English
        temperature     = 0,
        response_format = "text"
    ).strip().lower()

    print("🗣️ Heard:", transcript)
    return transcript

# ─────────────────────────────────────────────────────────────────────────────
#  MAIN LOOP
# ─────────────────────────────────────────────────────────────────────────────
with sr.Microphone() as src:
    r.adjust_for_ambient_noise(src, duration=1.5)
    print("🎤 Ready for voice commands!")

    while True:
        try:
            cmd = listen_for_command() if listening else \
                  listen_for_command(timeout=None, limit=3)

            if cmd in ("stop listening","pause listening"):
                listening=False; print("⏸️ Paused. Say 'continue listening' to resume."); continue
            if not listening and cmd in ("continue listening","start listening"):
                listening=True;  print("▶️ Resumed."); continue

            if listening:
                handle_ai_command(ask_gpt_for_action(cmd))

        except sr.WaitTimeoutError:
            continue
        except KeyboardInterrupt:
            break
        except Exception as e:
            print("⚠️", e)
        time.sleep(0.3)


🎤 Ready for voice commands!
🗣️ Heard: open google.com
🗣️ Heard: search how to get the cat off my lap.
🗣️ Heard: scroll down.
🗣️ Heard: scroll down.
🗣️ Heard: scroll down.
🗣️ Heard: click on the quora link
🗣️ Heard: click on the open the first link
🗣️ Heard: you
🗣️ Heard: scroll down
🗣️ Heard: click on tell me more
🗣️ Heard: you
🗣️ Heard: you
🗣️ Heard: create a new tab.
❌ 'url'
🗣️ Heard: you


In [5]:
import openai
from dotenv import load_dotenv
import os

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI()

model_name = os.getenv("OPENAI_MODEL", "gpt-4-turbo")

try:
    print(f"🧪 Testing model: {model_name}...")
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": "Say hello"}],
    )
    print(f"✅ Model '{model_name}' responded:")
    print(response.choices[0].message.content)
except Exception as e:
    print(f"❌ Error testing model '{model_name}':")
    print(e)
    exit(1)


🧪 Testing model: gpt-4.1-mini...
✅ Model 'gpt-4.1-mini' responded:
Hello! How can I help you today?
