In [2]:
import cv2
import torch
import numpy as np
import pyautogui
from ultralytics import YOLO

# Load YOLOv8 model (pre-trained on COCO dataset, which includes cats & dogs)
model = YOLO("yolov8n.pt")  # Downloaded automatically if not found

# COCO dataset class labels (relevant ones)
class_names = {16: "Dog", 17: "Cat"}  # COCO assigns 16 to Dog, 17 to Cat

In [3]:
def detect_object(frame):
    """Detects a cat or dog in the frame and returns its label + position."""
    results = model(frame)  # Run YOLO object detection
    detections = []

    for result in results:
        for box in result.boxes:
            cls = int(box.cls.item())  # Class index
            if cls in class_names:
                label = class_names[cls]
                x1, y1, x2, y2 = map(int, box.xyxy[0])  # Bounding box
                confidence = box.conf.item()  # Confidence score

                if confidence > 0.7:  # Threshold to filter weak detections
                    detections.append((label, (x1, y1, x2, y2)))

    return detections if detections else None

In [4]:
device = torch.device("cpu")
model.to(device)  # Move model to CPU

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

In [5]:
def find_and_click():
    """Captures screen, detects object, and clicks if found."""
    screenshot = pyautogui.screenshot()
    frame = np.array(screenshot)
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    detections = detect_object(frame)

    if detections:
        for label, (x1, y1, x2, y2) in detections:
            x, y = (x1 + x2) // 2, (y1 + y2) // 2  # Click center of object
            pyautogui.moveTo(x, y, duration=0.2)
            pyautogui.click()
            print(f"Clicked on {label} at ({x}, {y})!")

In [6]:
# Run in a loop
try:
    while True:
        find_and_click()
except KeyboardInterrupt:
    # Ctr + C to interrupt
    print("Script terminated.")


0: 384x640 1 tv, 1 laptop, 149.6ms
Speed: 5.9ms preprocess, 149.6ms inference, 3.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 tv, 1 laptop, 78.1ms
Speed: 4.7ms preprocess, 78.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 tv, 81.1ms
Speed: 3.3ms preprocess, 81.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 tv, 85.5ms
Speed: 4.1ms preprocess, 85.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 tv, 80.4ms
Speed: 3.1ms preprocess, 80.4ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 81.0ms
Speed: 4.0ms preprocess, 81.0ms inference, 3.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 99.9ms
Speed: 11.1ms preprocess, 99.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 88.0ms
Speed: 3.5ms preprocess, 88.0ms inference, 1.0ms postprocess per 