# Environment setup

In [2]:
import dotenv
from openai import OpenAI
import json


In [3]:
# Load variables from a .env file into the process environment before the
# client is built (presumably this is where the OpenAI API key comes from — confirm).
dotenv.load_dotenv()
# Shared API client used by the assistant, thread, and run cells below.
client = OpenAI()

# Creating the Functions

In [4]:
# Tool schema for `click`: takes the x/y pixel coordinates to click at.
click_object = dict(
    type="function",
    function=dict(
        name="click",
        description="Click at a specific pixel location",
        parameters=dict(
            type="object",
            properties=dict(
                x=dict(type="number", description="The x value of where to click"),
                y=dict(type="number", description="The y value of where to click"),
            ),
            # Both coordinates are mandatory.
            required=["x", "y"],
        ),
    ),
)

In [5]:
# Tool schema for `get_description_of_screen`: declared without a
# "parameters" entry, i.e. the function takes no arguments.
get_description_of_screen_object = dict(
    type="function",
    function=dict(
        name="get_description_of_screen",
        description="Fetch a text description of the user's entire screen",
    ),
)

In [6]:
# Tool schema for `mdfind`: a single required string `query`.
mdfind_object = dict(
    type="function",
    function=dict(
        name="mdfind",
        description="Find files in the user's computer using Apple's mdfind command",
        parameters=dict(
            type="object",
            properties=dict(
                query=dict(type="string", description="The query to search for"),
            ),
            required=["query"],
        ),
    ),
)

In [7]:
# Tool schema for `open`: `filepath` is required; `reveal` and `background`
# are optional boolean switches that default to False.
open_object = dict(
    type="function",
    function=dict(
        name="open",
        description="Open a file (or a folder or URL), using Apple's open command",
        parameters=dict(
            type="object",
            properties=dict(
                filepath=dict(
                    type="string",
                    description="The path to the file to open",
                ),
                reveal=dict(
                    type="boolean",
                    description="Whether to reveal the file in the Finder instead of opening it",
                    default=False,
                ),
                background=dict(
                    type="boolean",
                    description="Whether to open the app in the background",
                    default=False,
                ),
            ),
            required=["filepath"],
        ),
    ),
)

In [8]:
# Tool schema for `write`: a single required string `text` to be typed.
write_object = dict(
    type="function",
    function=dict(
        name="write",
        description="Controls the user's keyboard to write plain text",
        parameters=dict(
            type="object",
            properties=dict(
                text=dict(type="string", description="The text to write"),
            ),
            required=["text"],
        ),
    ),
)

In [9]:
import pyautogui

# Tool schema for `press`: one required `key`, restricted to the key names
# listed in pyautogui.KEYBOARD_KEYS.
press_object = {
    "type": "function",
    "function": {
        "name": "press",
        # The function-level description states what the tool does; the
        # per-parameter text lives on `key` below so the model sees both.
        "description": "Press a single key on the user's keyboard",
        "parameters": {
            "type": "object",
            "properties": {
                "key": {
                    "type": "string",
                    "enum": pyautogui.KEYBOARD_KEYS,
                    "description": "The key to be pressed."
                }
            },
            "required": ["key"]
        }
    }
}

In [10]:
# Tool schema for `hot_keys`: a required array `keys`, each item restricted
# to the key names listed in pyautogui.KEYBOARD_KEYS.
hot_keys_object = dict(
    type="function",
    function=dict(
        name="hot_keys",
        description="""Performs key down presses on the arguments passed in order, then performs key releases in reverse order. The effect is that calling hotkey('ctrl', 'shift', 'c') would perform a "Ctrl-Shift-C" hotkey/keyboard shortcut press.""",
        parameters=dict(
            type="object",
            properties=dict(
                keys=dict(
                    type="array",
                    items=dict(type="string", enum=pyautogui.KEYBOARD_KEYS),
                    description="The keys to type",
                ),
            ),
            required=["keys"],
        ),
    ),
)

In [11]:
# Tool schema for `prompt_input`: a single required string `text` shown to
# the user as the prompt.
prompt_input_object = dict(
    type="function",
    function=dict(
        name="prompt_input",
        description="Prompt the user for some form of input",
        parameters=dict(
            type="object",
            properties=dict(
                text=dict(
                    type="string",
                    description="The text to display to the user to prompt for input",
                ),
            ),
            required=["text"],
        ),
    ),
)

# Tool schema for `mark_completed`: declared without a "parameters" entry,
# i.e. the function takes no arguments.
mark_completed_object = dict(
    type="function",
    function=dict(
        name="mark_completed",
        description="Marks the overall task as completed. Ends the session with the user.",
    ),
)

# Creating and running the assistant

In [12]:
# Register the assistant with every tool schema defined above.
# The instructions must refer to tools by their registered names: the user-facing
# prompt tool is `prompt_input` and the termination tool is `mark_completed`.
assistant = client.beta.assistants.create(
    name="Mac OS Assistant",
    instructions="""You are an assistant created by Apple to call functions to dynamically automate workflows in Mac OS. Users give you tasks, and you need to perform them. You can make reasonable assumptions about user's intents.
    
    The first thing you will do is to call `get_description_of_screen` to get a description of the user's entire screen. You will then generate a list of subtasks required to complete the overall task. After that, you will call `prompt_input` to let them approve the list of subtasks. If they do, then you will proceed with functions to complete the subtasks, thereby completing the overall task.
    
    ONLY communicate through function calls. If you need to communicate with the user, call `prompt_input`. When the overall task is finished, call `mark_completed`.
    
    Example:
        User: Open cat.jpg
        Assistant: prompt_input(text="I will search for the file on your computer using mdfind\n And then I will open it using open. Does that sound good?")
        User: Yes
        Assistant: mdfind(query="cat.jpg")
        User: /Users/john/Downloads/cat.jpg
        Assistant: open(filepath="/Users/john/Downloads/cat.jpg")
        Assistant: mark_completed()
    
    If you're stuck, remember to call `get_description_of_screen` to get a description of the user's screen.
    """,
    tools=[
        click_object,
        get_description_of_screen_object,
        mdfind_object,
        open_object,
        write_object,
        press_object,
        hot_keys_object,
        prompt_input_object,
        mark_completed_object,
    ],
    model="gpt-4-1106-preview",
)

In [13]:
# Start a new conversation thread; the message and run cells below attach to it via thread.id.
thread = client.beta.threads.create()

In [14]:
# Seed the thread with the user's task for the assistant.
message = client.beta.threads.messages.create(
    thread_id=thread.id, role="user", content="Send a text to Peter"
)

In [15]:
# Start the first run of the assistant on the thread.
run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant.id)

In [57]:
import time

def wait_for_run(run, thread, poll_interval=0.5):
    """Poll a run until it leaves its transitional states.

    Args:
        run: The run object to watch; only ``run.id`` and ``run.status`` are read.
        thread: The thread the run belongs to; only ``thread.id`` is read.
        poll_interval: Seconds to wait before each poll (default 0.5).

    Returns:
        The most recently retrieved run object, whose status is no longer
        "queued", "in_progress" or "cancelling".
    """
    # "cancelling" is transitional as well: returning on it would hand the
    # caller a run that has neither a final status nor a required action.
    while run.status in ("queued", "in_progress", "cancelling"):
        # Sleep *before* polling so no time is wasted once the run has settled.
        time.sleep(poll_interval)
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id
        )
    print("Run is now " + run.status)
    return run

In [58]:
import time

from automation_for_assistants import click_on, get_description_of_screen, mdfind, open, press, write, hot_keys

# Drive the assistant until it calls `mark_completed`: wait for the run to
# request tool calls, execute each one locally, then submit all outputs back.
completed = False

while not completed:
    run = wait_for_run(run, thread)

    # A run that ended without requesting a tool call produced plain text,
    # which this loop does not consume; nudge the model back to tool calls.
    while run.status in ("completed", "failed", "expired", "cancelled"):
        print('Restarting run')
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            # Must name the tools exactly as registered (`prompt_input`, `mark_completed`).
            content="I can't read this output! If you're trying to mark the task as completed, please call `mark_completed`. And if you're trying to talk to me, please call `prompt_input`",
        )

        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=assistant.id
        )

        run = wait_for_run(run, thread)

    calls = run.required_action.submit_tool_outputs.tool_calls
    tool_outputs = []
    for tool_call in calls:
        func = tool_call.function
        # Guard against an empty argument string for parameterless tools.
        arguments = json.loads(func.arguments or "{}")
        print(f'Calling {func.name} with arguments ({arguments})')

        try:
            if func.name == "open":
                output = open(
                    arguments["filepath"],
                    arguments.get("reveal", False),
                    arguments.get("background", False)
                )
            elif func.name == "click":
                output = click_on(arguments["x"], arguments["y"])
            elif func.name == "mdfind":
                output = mdfind(arguments["query"])
            elif func.name == "get_description_of_screen":
                output = get_description_of_screen(client)
            elif func.name == "press":
                output = press(arguments["key"])
            elif func.name == "write":
                output = write(arguments["text"])
            elif func.name == "hot_keys":
                output = hot_keys(arguments["keys"])
            elif func.name == "prompt_input":
                output = input(arguments["text"])
            elif func.name == "mark_completed":
                # Do not break here: every tool call in the batch needs an
                # output, otherwise the submission below is incomplete.
                completed = True
                output = "Task marked as completed."
            else:
                output = f"Error: unknown function `{func.name}`"
        except Exception as exc:
            # Report the failure to the model instead of abandoning the run
            # in `requires_action`; KeyboardInterrupt still propagates.
            print(f'{func.name} failed: {exc!r}')
            output = f"Error while running `{func.name}`: {exc!r}"

        tool_outputs.append({
            "tool_call_id": tool_call.id,
            "output": json.dumps(output)
        })

    print("TOOL OUTPUTS:")
    print(tool_outputs)

    run = client.beta.threads.runs.submit_tool_outputs(
        thread_id=thread.id,
        run_id=run.id,
        tool_outputs=tool_outputs
    )

    # Brief pause before the next status poll.
    time.sleep(0.3)



Run is now requires_action
Calling get_description_of_screen with arguments ({})
TOOL OUTPUTS:
[{'tool_call_id': 'call_mDLXpnWbabV2fYQ54b6NxF2r', 'output': '"You\'re looking at a screenshot of Visual Studio Code, a code editor, with an open file named \\"test_assistants.ipynb.\\" The file appears to be a Jupyter notebook as indicated by the file extension `.ipynb` and the cell-based interface within Visual Studio Code.\\n\\nThe screen is split into two main areas:\\n\\n1. **Code Area (Right Side):**\\n   - The right side shows the content of the \\"test_assistants.ipynb\\" file, which includes Python code. The visible code includes part of a function or a script that utilizes a variable called `client` to interact with some kind of threading and messaging system, which seems to be operating in a beta environment (as suggested by `client.beta`). \\n   - There are two visible cells, one that has run and completed, as indicated by the checkmark next to the number [42], and a runtime of 0.

KeyboardInterrupt: Interrupted by user

: 

In [42]:
client.beta.threads.messages.list(thread_id=thread.id)

SyncCursorPage[ThreadMessage](data=[ThreadMessage(id='msg_shOzS2dHEeRMC5EKjCY4KfAo', assistant_id='asst_0nOTf7URhjMnb6dYw3bgZUfB', content=[MessageContentText(text=Text(annotations=[], value="The description of the screen suggests that the user is currently working within the Visual Studio Code (VS Code) environment. There is no indication of a messaging application or interface open on the screen where a text could be sent to someone named Peter. \n\nTo proceed, I can prompt the user to open a messaging application where they usually communicate with Peter. If the user uses a messaging app that is command-line accessible or inter-operable with AppleScript or Automator, I might be able to automate the process of sending a text. Alternatively, if the user has an iPhone set up to send messages through their Mac, we could use the Messages app and AppleScript to send the text.\n\nSubtasks:\n1. Prompt the user to specify the name of the messaging application they use to communicate with Pet