In [9]:
from dotenv import load_dotenv
from openai import OpenAI
import json
load_dotenv()

True

# High-Level Prompting to Get Steps

In [10]:

# Shared OpenAI client used by every chat-completion call in this notebook.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment populated by load_dotenv() — confirm.
client = OpenAI()

In [11]:
# System prompt for the planning model: persona, output contract, then one
# worked example (request followed by the expected JSON reply).
# The contract sentence describes a JSON *object* with a "steps" key so that it
# agrees with both the worked example and the `json_object` response format
# requested from the API; the example addresses the message via the recipient
# field, since Peter Liu is who the message is sent to.
high_level_prompt = """
You are an intelligent caregiver assistant, skilled in giving specific explanations of how to use computers to elderly people. Their vision is poor, so try to avoid mouse movements in favor of keyboard inputs and commands. Avoid using terms like "click on" in favor of "focus" or "select".

You output a JSON object with a single key "steps" whose value is a list of strings, with each string corresponding to a step.
My grandma needs help to send a happy birthday text to Peter Liu on her Macbook Air. Can you offer specific steps for doing so?
{
    "steps": [
        "Open Spotlight",
        "Use Spotlight to open the Messages app",
        "Create a new message",
        "Focus on the recipient field",
        "Type 'Peter Liu'",
        "Focus on the body of the message",
        "Type 'Happy Birthday'",
        "Send the message"
    ]
}
"""

In [12]:
# The user's overall goal. It is interpolated into the planning request and
# again into every per-step prompt of the decomposition loop.
task = "write an essay in Google Docs about monkeys"

In [13]:
# Ask the planning model for the step list. The reply is requested as a JSON
# object so that it can be fed straight to json.loads.
completion = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    response_format=dict(type="json_object"),
    messages=[
        dict(role="system", content=high_level_prompt),
        dict(
            role="user",
            content=f"My grandmother needs help to {task} on her Macbook Air. Can you offer specific steps for doing so?",
        ),
    ],
)

In [16]:
# Raw JSON text of the planning model's reply.
text = completion.choices[0].message.content
# Parsed plan. The recorded output shows a dict of the form {"steps": [...]},
# so the list of step strings lives under steps["steps"], not in `steps` itself.
steps = json.loads(text)
# Optional numbered listing of the plan:
# for ind, step in enumerate(steps["steps"]):
#     print(f'{ind + 1}. {step}')

In [17]:
steps

{'steps': ['Open Spotlight',
  'Use Spotlight to open the Google Chrome browser',
  'Go to the Google Docs website',
  'Sign in to her Google account',
  'Create a new document in Google Docs',
  'Focus on the document title field',
  "Type 'Essay about Monkeys'",
  'Focus on the body of the document',
  'Start writing the essay about monkeys']}

# Converting Steps into Function Calls

In [156]:
!pip install scipy
!pip install tenacity
!pip install tiktoken
!pip install termcolor 
!pip install openai
!pip install requests



In [157]:
import json
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored

In [158]:
def pretty_print_conversation(messages):
    """Print a chat transcript, one colour per speaker.

    Args:
        messages: iterable of message dicts. Each needs a "role"; system/user
            messages need "content", tool messages need "name" and "content",
            and assistant messages show "function_call" when it is truthy and
            "content" otherwise.

    Messages whose role is not system/user/assistant/tool are skipped.
    """
    palette = {
        "system": "red",
        "user": "green",
        "assistant": "blue",
        "tool": "magenta",
    }

    for entry in messages:
        speaker = entry["role"]
        if speaker in ("system", "user"):
            line = f"{speaker}: {entry['content']}\n"
        elif speaker == "assistant":
            # A pending function call takes precedence over plain text content.
            shown = entry["function_call"] if entry.get("function_call") else entry["content"]
            line = f"assistant: {shown}\n"
        elif speaker == "tool":
            line = f"function ({entry['name']}): {entry['content']}\n"
        else:
            continue
        print(colored(line, palette[speaker]))


In [159]:
# Key names accepted by the keyDown / keyUp tools (the contents of
# pyautogui.KEYBOARD_KEYS). Kept in one place so the two schemas cannot drift.
_KEYBOARD_KEYS = [
    '\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(',
    ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~',
    'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace',
    'browserback', 'browserfavorites', 'browserforward', 'browserhome',
    'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear',
    'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete',
    'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10',
    'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20',
    'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
    'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja',
    'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail',
    'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack',
    'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6',
    'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn',
    'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn',
    'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator',
    'shift', 'shiftleft', 'shiftright', 'sleep', 'space', 'stop', 'subtract', 'tab',
    'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen',
    'command', 'option', 'optionleft', 'optionright',
]


def _key_tool(name, description, key_description):
    """Build the tool schema for a function that takes a single `key` argument.

    Args:
        name: function name exposed to the model.
        description: what the function does.
        key_description: description attached to the `key` parameter.

    Returns:
        A dict in the chat-completions "tools" format.
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    "key": {
                        "type": "string",
                        # Copy so every schema owns an independent list.
                        "enum": list(_KEYBOARD_KEYS),
                        "description": key_description,
                    }
                },
                "required": ["key"],
            },
        },
    }


# Tool schemas offered to the model: hold a key, release a key, type text.
tools = [
    _key_tool("keyDown", "Holds down a key from pyautogui.KEYBOARD_KEYS", "The key to press"),
    _key_tool("keyUp", "Releases a key from pyautogui.KEYBOARD_KEYS", "The key to release"),
    {
        "type": "function",
        "function": {
            "name": "write",
            "description": "Writes plain text to the screen",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The text to write"
                    }
                },
                "required": ["text"],
            }
        }
    },
]

In [163]:
# System prompt for the model that turns one plan step into concrete keyboard
# actions. The four available actions (hotKeys, write, press, promptUser) are
# described in plain text and the reply is expected as JSON.
# NOTE(review): these action names differ from the keyDown / keyUp / write
# schemas declared in `tools` — confirm which contract is intended.
low_level_prompt = """
You are an intelligent bot made by Apple, skilled at taking a specific action and decomposing it into function calls to help blind customers. You make reasonable assumptions about the input. Make sure to remember that users need to click enter after inputting text. Also remember users can jump around using tab.
The available functions are:

hotKeys(keys: list[str]) -- Simultaneously presses multiple keys
write(text: str) -- Writes plain text. For example, for filling input boxes. Not displayed to user.
press(key: str) -- Presses a key. For example, for hitting 'enter'.
promptUser(text: str) -- Prompts the user for input. For example, if the user's password is needed. This should be very rarely used.

You output a JSON that is used by their screenreader to tell them to perform the action.

Keys are defined by pyautogui.KEYBOARD_KEYS. This is 'backspace', 'capslock', 'space', 'enter', 'tab', 'f1', 'f2', 'f3', etc.
"""

In [166]:
# Sample screen description in the style produced by the vision model.
# NOTE(review): not referenced by any of the cells shown here — confirm whether
# it is still needed.
example_vision_output = """
The user's screen is showing Visual Studio Code with two open files, `overall.py` and `utils.py`, within a project containing various Python files and a virtual environment. The `overall.py` tab is active, and the file contains Python code, seemingly to work with an API. 

The integrated terminal at the bottom displays a command line that has just run `python3 overall.py`, indicating an execution of the Python script that might be interfaced with an API, as noted with the output "Calling vision API."

To open Messages without using a mouse, you can press `Command` + `Space` to open Spotlight Search. Then, start typing "Messages" until the Messages app is highlighted in the search results, and press `Enter` to launch the application.
"""


# Few-shot example for the step-to-action model: a user request ...
example_input = "Overall, I am trying to send a message to Peter. Currently, I am trying to open the messages app."
# ... and the JSON reply expected for it: a "steps" list whose items each name
# a function and its arguments.
example_output = """{
    "steps": [
        {
            "function": "hotKeys",
            "arguments": {
                "keys": ["command", "space"]
            }
        },
        {
            "function": "write",
            "arguments": {
                "text": "Messages"
            }
        },
        {
            "function": "press",
            "arguments": {
                "key": "enter"
            }
        }
    ]
}"""

In [173]:
import base64
from io import BytesIO
import pyautogui

def screen_to_base_64():
    """Capture the screen and return it as a base64-encoded JPEG.

    Returns:
        str: the JPEG bytes of the screenshot, base64-encoded and decoded to a
        UTF-8 string (suitable for embedding in a ``data:image/jpeg;base64,``
        URL).
    """
    screenshot = pyautogui.screenshot()
    # JPEG cannot store an alpha channel, and Pillow refuses to save e.g. an
    # RGBA image as JPEG. Screenshots can come back with alpha on some
    # platforms, so normalise to RGB before encoding.
    if screenshot.mode != "RGB":
        screenshot = screenshot.convert("RGB")

    buffered = BytesIO()
    screenshot.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

def _jpeg_image_part(b64_jpeg):
    """Wrap base64 JPEG data as an `image_url` content part for the chat API."""
    # The API documents `image_url` as an object with a `url` field.
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64_jpeg}"},
    }


def call_vision_api(
    inp_text,
    example_image_path="/Users/arvindh/Downloads/compressed.jpg",
    max_tokens=1024,
):
    """Describe the current screen with the vision model, guided by the user's goal.

    Sends a one-shot conversation (an example screenshot with its expected
    description) followed by `inp_text` and a fresh screenshot of this machine.

    Args:
        inp_text: what the user is trying to do; steers which screen details
            the description focuses on.
        example_image_path: path of the JPEG used as the few-shot example
            screenshot. Defaults to the original author's local file; pass a
            path that exists on your machine.
        max_tokens: upper bound on the length of the description.
            NOTE(review): the vision preview model is documented as having a
            low default, which would truncate descriptions — confirm the
            chosen bound is adequate.

    Returns:
        str: the model's description of the screen.

    Raises:
        OSError: if `example_image_path` cannot be read.
    """
    PROMPT = """
    You are an intelligent bot made by Apple to help blind Mac OS users. You describe their screens, making sure to explain the focused window, where the cursor is, and any available input boxes. There are a lot of possible details, so you try to focus on items related to the task they need help with.
    """

    EXAMPLE_INPUT_TEXT = "I am trying to find a video about monkeys."

    with open(example_image_path, "rb") as image_file:
        EXAMPLE_INPUT_IMAGE = base64.b64encode(image_file.read()).decode('utf-8')

    EXAMPLE_OUTPUT = """The user's screen is focused on a Safari window. In the Safari window, there are four open tabs. YouTube is the active tab. The user's input is not focused on anything in the YouTube tab. There is a YouTube icon in the top left, a search bar, a search icon, a microphone icon, various category labels, and six video recommendations. 
    """

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        max_tokens=max_tokens,
        messages=[
            {"role": "system", "content": PROMPT},
            # Few-shot example: request + screenshot, then the ideal answer.
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": EXAMPLE_INPUT_TEXT},
                    _jpeg_image_part(EXAMPLE_INPUT_IMAGE),
                ],
            },
            {"role": "assistant", "content": EXAMPLE_OUTPUT},
            # The real request: the user's goal plus a live screenshot.
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": inp_text},
                    _jpeg_image_part(screen_to_base_64()),
                ],
            },
        ],
    )

    return response.choices[0].message.content

In [174]:
call_vision_api("I want to open a new coding project")

PyAutoGUIException: PyAutoGUI was unable to import pyscreeze. (This is likely because you're running a version of Python that Pillow (which pyscreeze depends on) doesn't support currently.) Please install this module to enable the function you tried to call.

In [167]:
def _step_prompt(step_list, ind, task):
    """Build the user message asking the model to decompose step `ind`.

    Args:
        step_list: the ordered list of plan step strings.
        ind: index of the step to decompose.
        task: the overall goal the plan works towards.

    Returns:
        str: the prompt. Steps other than the last also mention the following
        step so the model knows what state to leave the machine in.
    """
    step = step_list[ind]
    is_first = ind == 0
    is_last = ind == len(step_list) - 1

    if is_first and is_last:
        # One-step plan: there is no following step to reference.
        return f"I'm done with that task. Now I'm doing {task}. Right now, I need to {step} to {task}."
    if is_first:
        # "that task" refers to the few-shot example that precedes this turn.
        return f"I'm done with that task. Now I'm doing {task}. Right now, I need to do {step} to set me up to do {step_list[ind + 1]} next. The ultimate goal is {task}."
    if is_last:
        return f"Now I need to {step} to {task}"
    return f"Now I need to do {step} to set me up to do {step_list[ind + 1]} next. The ultimate goal is {task}"


# The planner reply is parsed into a dict of the form {"steps": [...]};
# iterating the dict itself would yield its key, not the steps. A bare list is
# accepted too in case `steps` was already unwrapped.
step_list = steps["steps"] if isinstance(steps, dict) else list(steps)

# Conversation history, seeded with the few-shot example. After each iteration
# inputs[i] / outputs[i] hold the i-th request and the model's JSON reply.
inputs = [example_input]
outputs = [example_output]

for ind, step in enumerate(step_list):
    print(step)
    step_specific_prompt = _step_prompt(step_list, ind, task)
    inputs.append(step_specific_prompt)

    messages = [
        {"role": "system", "content": low_level_prompt},
    ]
    # Replay every completed exchange. zip stops at the shorter list
    # (`outputs`), so the prompt appended just above is not paired here.
    for past_input, past_output in zip(inputs, outputs):
        messages.append({"role": "user", "content": past_input})
        messages.append({"role": "assistant", "content": past_output})
    messages.append({"role": "user", "content": step_specific_prompt})

    functional_completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        response_format={
            "type": "json_object"
        }
    )

    # Round-trip through json so the stored history is normalised and any
    # malformed reply fails here rather than in a later turn.
    r = json.loads(functional_completion.choices[0].message.content)
    formatted_reply = json.dumps(r, indent=4)
    outputs.append(formatted_reply)
    print(formatted_reply)

Open Spotlight
{
    "steps": [
        {
            "function": "hotKeys",
            "arguments": {
                "keys": [
                    "command",
                    "space"
                ]
            }
        },
        {
            "function": "write",
            "arguments": {
                "text": "chrome"
            }
        },
        {
            "function": "press",
            "arguments": {
                "key": "enter"
            }
        }
    ]
}
Use Spotlight to open the Google Chrome browser
{
    "steps": [
        {
            "function": "hotKeys",
            "arguments": {
                "keys": [
                    "command",
                    "space"
                ]
            }
        },
        {
            "function": "write",
            "arguments": {
                "text": "chrome"
            }
        },
        {
            "function": "press",
            "arguments": {
                "key": "enter"
            }

In [1]:
!pip install pushover

Collecting pushover
  Downloading pushover-0.5-py3-none-any.whl (3.0 kB)
Installing collected packages: pushover
Successfully installed pushover-0.5


In [4]:
import os

import requests

# Credentials are read from the environment (for example a local .env file
# loaded with load_dotenv()) so they are never stored in the notebook.
# The token and user key that were previously hard-coded here should be
# treated as leaked and rotated.
PUSHOVER_API_KEY = os.environ["PUSHOVER_API_KEY"]
PUSHOVER_USER_KEY = os.environ["PUSHOVER_USER_KEY"]

response = requests.post(
    'https://api.pushover.net/1/messages.json',
    json={
        'token': PUSHOVER_API_KEY,
        'message': 'hi',
        'user': PUSHOVER_USER_KEY,
    },
    # Without a timeout a stalled connection would hang the cell indefinitely.
    timeout=10,
)
# Surface a rejected request as an exception instead of a quiet non-200 result.
response.raise_for_status()
response

<Response [200]>