In [6]:
## Start 
import pyautogui
from PIL import Image
import pandas as pd
from groq import Groq
import torch 
from dotenv import load_dotenv
import os
import re
import json
import time
import numpy as np
# Load/save cache between sessions
import json


import torch

from icon_caption import load_resnet_model, load_yolo_model,load_florence_model,load_ocr,get_parsed_icons_captions



def Capture_ScreenShot(screenshot_path="screenshot.png"):
    """Grab the current screen with pyautogui and save it to disk.

    Args:
        screenshot_path: File path the captured image is written to.

    Returns:
        The ``screenshot_path`` argument, unchanged, so the caller can pass
        it on to later steps.
    """
    pyautogui.screenshot().save(screenshot_path)
    return screenshot_path


def LLM(api,user_prompt,model="llama-3.3-70b-versatile",system_prompt=None,max_new_token=1024):
    """Send a chat prompt through ``api`` and return the reply parsed as JSON.

    Args:
        api: Client exposing ``api.chat.completions.create(...)`` (a Groq
            client in this notebook).
        user_prompt: Text sent as the user message.
        model: Model identifier forwarded to the client.
        system_prompt: Optional text for the system message. When ``None``
            no system message is sent at all.
        max_new_token: Forwarded as ``max_completion_tokens``.

    Returns:
        The object decoded from the first ``{ ... }`` span in the reply, or
        ``None`` (after printing a notice) when the reply is empty, contains
        no such span, or the span is not valid JSON.
    """
    messages = []
    # Formatting ``None`` into the f-string would send the literal text
    # "None" as the system prompt, so only add the message when one is given.
    if system_prompt is not None:
        messages.append({"role": "system", "content": f"{system_prompt}"})
    messages.append({"role": "user", "content": f"{user_prompt}"})

    completion = api.chat.completions.create(
        model=model,
        messages=messages,
        max_completion_tokens=max_new_token,
        response_format={"type": "json_object"},
    )

    raw_reply = completion.choices[0].message.content
    # ``content`` may be None; substitute "" so the regex search cannot crash.
    json_match = re.search(r'\{.*\}', raw_reply or "", re.DOTALL)
    if json_match is None:
        print("No valid JSON found in the response")
        return None

    try:
        return json.loads(json_match.group(0))
    except json.JSONDecodeError:
        # Brace-delimited text that is not valid JSON is treated the same
        # way as a reply with no JSON at all.
        print("No valid JSON found in the response")
        return None

    
def Icon_Selection(api,task,parsed_content_list):
    """Ask the LLM which parsed icon must be clicked to perform ``task``.

    Args:
        api: Chat client forwarded to ``LLM``.
        task: Description of the action to carry out.
        parsed_content_list: Icon information embedded verbatim in the prompt.

    Returns:
        The chosen icon id as an ``int``.

    Raises:
        ValueError: If the LLM reply is missing, has no ``"id"`` entry, or
            the id cannot be converted to an integer.
    """
    system_prompt = "You are an excellent GUI Automation Bot that work on windows 10. Respond only in json format"
    user_prompt = f"Given the icons information only return id of the icon in json format like {{\"id\" : \"8\"}} which needs to be clicked to complete this task {task}. Here is the icons information : {parsed_content_list}"
    reply = LLM(api,user_prompt,system_prompt=system_prompt,max_new_token=32)
    print(reply)

    # ``LLM`` returns None when it could not parse the reply; report that
    # (and any other malformed payload) with one clear error instead of an
    # opaque subscript/KeyError failure.
    try:
        return int(reply["id"])
    except (TypeError, KeyError, ValueError) as exc:
        raise ValueError(f"LLM did not return a usable icon id: {reply!r}") from exc
    
def Click_Icon(label_coordinates,icon_id,screenshot_path="screenshot.png"):
    """Double-click the centre of the icon identified by ``icon_id``.

    Args:
        label_coordinates: Container indexed by ``icon_id``; each entry
            unpacks to ``(x1, y1, x2, y2)``. The values are multiplied by the
            screenshot width/height, i.e. they are treated as fractions of
            the image size.
        icon_id: Key/index of the icon to click.
        screenshot_path: Screenshot whose pixel size is used for scaling.
            Defaults to the file name ``Capture_ScreenShot`` writes by
            default, so the function no longer relies on a global variable
            of the same name being defined.
    """
    # Only the dimensions are needed; the context manager closes the file.
    with Image.open(screenshot_path) as image:
        w , h = image.size

    # Convert coordinates from normalized to actual coordinates
    x1 , y1 , x2 , y2 = label_coordinates[icon_id]
    x1 , y1 , x2 , y2 = int(x1 * w) , int(y1 * h) , int(x2 * w) , int(y2 * h)

    # Click at centre of icon
    x = (x1+x2)//2
    y = (y1+y2)//2
    pyautogui.click(x=x,y=y,clicks=2)

def Generate_Steps(task,api):
    """Ask the LLM for a step-by-step GUI automation plan for ``task``.

    Args:
        task: Natural-language description of the Windows 10 task.
        api: Chat client forwarded to ``LLM``. The call now uses this
            argument instead of the global ``groq_api``.

    Returns:
        Whatever ``LLM`` returns: the parsed JSON reply, which the prompt
        asks to have the form ``{'steps': [{'action': ...}, ...]}``, or
        ``None`` when no JSON could be parsed.
    """
    template = """ Generate a structured step-by-step GUI automation plan for a given Windows 10 task.
    Rules: 
    1.Ensure steps logically follow the required task and flow and no illogical step.
    2.Each action should be precise and achievable through GUI automation (Mouse click or keyboard input).
    3.Should work for any general Windows 10 GUI task. 
    4.Only return actions that can be peformed using clicks by the bot.
    Output Format: 
    { 'steps' :[{ 'action' : 'current action' } , {'action' : 'current action' }]}
    """
    system_prompt = "You are an excellent GUI Automation Bot that work on windows 10. Respond only in json format"
    user_prompt = template + f"Here is the Task return a step by step plan according to above mentioned details : {task}"
    response = LLM(api,user_prompt,model="llama-3.3-70b-versatile",system_prompt=system_prompt)
    return response

In [9]:
if __name__ == "__main__":
    # Driver: ask the LLM for a click-by-click plan, then for every step
    # capture the screen, parse its icons, let the LLM pick one, and click it.

    # Set device
    # NOTE(review): `device` is not used anywhere in this cell; kept in case
    # other cells read it.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Initialize llm
    load_dotenv()  # Load environment variables from .env file
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    # Keep the name `groq_api`: the original Generate_Steps reads this global.
    groq_api = Groq(api_key=GROQ_API_KEY,)

    # Initialize Models
    resnet_path = "resnet50.pt"
    yolo_path = "weights/icon_detect/model.pt"
    florence_path = "weights/icon_caption_florence"

    resnet_model = load_resnet_model(resnet_path)
    yolo_model = load_yolo_model(yolo_path)
    florence_model , florence_processor = load_florence_model(florence_path)
    easyocr_reader , paddle_ocr = load_ocr()

    # Enable multithreading
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single thread rather than passing None to torch.
    num_cores = os.cpu_count() or 1
    torch.set_num_threads(num_cores)

    # Seconds to wait after each click so the UI can settle before the next
    # screenshot is taken.
    STEP_DELAY_SECONDS = 20

    task = 'currently iam on desktop which have google chrome icon , open google chrome , google chrome will open you will see two profiles choose Muhammad Ahmad profile , go to google classroom icon is alredy in shortcut so when you will open chrome you will click on classroom icon , after that go to internet of things classroom '
    steps = Generate_Steps(task,groq_api)
    print(steps)
    # Generate_Steps returns None when the LLM reply could not be parsed;
    # stop with a clear message before touching the desktop.
    if not steps or 'steps' not in steps:
        raise RuntimeError("LLM did not return a step plan; cannot continue")

    # Go to Desktop
    pyautogui.hotkey('win', 'd')

    # NOTE(review): `count` is never updated or read in this cell.
    count = 0
    for step in steps['steps']:
        print(step)
        sub_task = step['action']
        # Capture ScreenShot
        # Keep the name `screenshot_path`: the original Click_Icon reads
        # this global.
        screenshot_path = Capture_ScreenShot()

        # Parse ScreenShot
        icon_coordinates , icon_descriptions = get_parsed_icons_captions(screenshot_path,florence_model,florence_processor,yolo_model,paddle_ocr,easyocr_reader,resnet_model)

        # Select icon
        icon_id = Icon_Selection(groq_api,sub_task,icon_descriptions)

        # Click icon
        Click_Icon(icon_coordinates,icon_id)
        time.sleep(STEP_DELAY_SECONDS)



{'steps': [{'action': 'Double click on Google Chrome icon on the desktop'}, {'action': 'Click on the Muhammad Ahmad profile'}, {'action': 'Click on the Google Classroom icon'}, {'action': 'Click on the Internet of Things classroom'}]}
{'action': 'Double click on Google Chrome icon on the desktop'}

0: 384x640 59 icons, 953.5ms
Speed: 11.0ms preprocess, 953.5ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)
len(filtered_boxes): 64 32
Uncached Icons = 0
{'id': '14'}
{'action': 'Click on the Muhammad Ahmad profile'}

0: 384x640 67 icons, 1846.1ms
Speed: 9.0ms preprocess, 1846.1ms inference, 5.0ms postprocess per image at shape (1, 3, 384, 640)
len(filtered_boxes): 74 37
Uncached Icons = 1
Time to get parsed content: 15.22726845741272
{'id': '11'}
{'action': 'Click on the Google Classroom icon'}

0: 384x640 74 icons, 1843.1ms
Speed: 8.0ms preprocess, 1843.1ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)
len(filtered_boxes): 88 33
Uncached Icons = 8
Tim