In [1]:
from openai import OpenAI
import os
from dotenv import load_dotenv
from typing import List, Dict, Any
import httpx
from datetime import datetime
import re
import json

load_dotenv()

True

In [2]:
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

In [None]:
tools = [
    {
        "type": "function",
        "name": "open_application",
        "description": "Open an application on the mobile device",
        "parameters": {
            "type": "object",
            "properties": {
                "appName": {
                    "type": "string",
                    "description": "Name of the application to open"
                }
            },
            "required": ["appName"],
        }
    },
    {
        "type": "function",
        "name": "take_action",
        "description": "Take an action on a node in the accessibility tree",
        "parameters": {
            "type": "object",
            "properties": {
                "nodeInfo": {
                    "type": "string",
                    "description": "Node information for the action"
                },
                "actionType": {
                    "type": "string",
                    "description": "Type of action to perform (e.g., 'click', 'scroll', 'double tap')"
                }
            },
            "required": ["nodeInfo", "actionType"],
        }
    },
    {
        "type": "function",
        "name": "input_text",
        "description": "Input text into a text field in the accessibility tree",
        "parameters": {
            "type": "object",
            "properties": {
                "nodeInfo": {
                    "type": "string",
                    "description": "Node information for the text field"
                },
                "text": {
                    "type": "string",
                    "description": "Text to input into the text field"
                }
            },
            "required": ["nodeInfo", "text"],
        }
    }
]

def open_application(appName: str):
    print(f"Opening application: {appName}")
    return input("New AccessibilityTree: ")


def take_action(nodeInfo: str, actionType: str):
    if actionType == "click":
        print(f"Clicking on {nodeInfo} actionType: {actionType}")
        return input(f"Clicking on {nodeInfo} actionType: {actionType} ")

def input_text(nodeInfo: str, text: str):
    print(f"Inputting text '{text}' into {nodeInfo}")
    return input(f"Inputting text '{text}' into {nodeInfo} ")


In [14]:
input_list = [
    {
        "role" : "developer",
        "content" : """
        You are a task executioner assistant on mobile devices. For completing the task you will interact with the accessibility tree of the device using tools.
        You will use the tool 'open_application', 'take_action', 'input_text' to interact. After taking each interactions, new accessibility tree will be provided to you.
        For 'take_action', 'input_text' tool, you need to provide only one node information and action type at a time.
        Observe the accessibility tree and take next action accordingly.
        Finish the task within 10 observation and reply Answer with this format "Task Completed".
        """
    },
    {
        "role" : "user",
        "content" : "Set navigation to The Westion Dhaka Hotel on Google Maps application."
    }
]

In [15]:
last_response = None

cnt = 0
while cnt < 10:
    response = client.responses.create(
        model="openai/gpt-oss-120b:free",
        tools=tools,
        input=input_list,
    )

    print(response.output)
    last_response = response

    input_list += response.output
    final_answer = False
    for item in response.output:
        if item.type == "function_call":
            if item.name == "take_action":
                # 3. Execute the function logic for take_action
                result = take_action(**json.loads(item.arguments))
                
                # 4. Provide function call results to the model
                input_list.append({
                    "type": "function_call_output",
                    "call_id": item.call_id,
                    "output": json.dumps({
                        "newNodeTree": result
                    })
                })
            elif item.name == "open_application":
                # 3. Execute the function logic for open_application
                result = open_application(json.loads(item.arguments))
                
                # 4. Provide function call results to the model
                input_list.append({
                    "type": "function_call_output",
                    "call_id": item.call_id,
                    "output": json.dumps({
                        "newNodeTree": result
                    })
                })
            elif item.name == "input_text":
                # 3. Execute the function logic for input_text
                result = input_text(**json.loads(item.arguments))
                
                # 4. Provide function call results to the model
                input_list.append({
                    "type": "function_call_output",
                    "call_id": item.call_id,
                    "output": json.dumps({
                        "newNodeTree": result
                    })
                })
        elif item.type == "message":
            final_answer = True
            break
    cnt += 1
    # print(input_list)
    
    if final_answer:
        print(response.output_text)
        break

    

[ResponseReasoningItem(id='rs_tmp_7k5hm6t15to', summary=[], type='reasoning', content=[Content(text='We need to open Google Maps, then set navigation to "The Westion Dhaka Hotel". Steps: open_application appName "Google Maps". Then maybe need to tap search bar, input text, select result, then tap directions, then start navigation.\n\nWe need to interact with accessibility tree; we will get tree after each action. We\'ll proceed.\n\n', type='reasoning_text')], encrypted_content=None, status=None), ResponseFunctionToolCall(arguments='{\n  "appName": "Google Maps"\n}', call_id='chatcmpl-tool-ba9d4ec75cb95fda', name='open_application', type='function_call', id='chatcmpl-tool-ba9d4ec75cb95fda', status=None)]
Opening application: {'appName': 'Google Maps'}
[ResponseReasoningItem(id='rs_tmp_673oxn42who', summary=[], type='reasoning', content=[Content(text='We need to set navigation to "The Westion Dhaka Hotel". In Google Maps, we need to input search query in the search box (search_omnibox_te