In [None]:
from IPython import display
display.clear_output()

import cv2
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import roboflow

import ultralytics
from ultralytics import YOLO
ultralytics.checks()

In [None]:
# Apple Silicon only: report whether PyTorch can use the MPS backend
import torch

mps_available = torch.backends.mps.is_available()
print(mps_available)

In [None]:
# Report CUDA availability first, then how many GPUs PyTorch can see (one value per line)
import torch

print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")

# Treino de um modelo

In [None]:
# Download the dataset from Roboflow (run this after labeling the images and creating the dataset).

# Read the Private API Key (Roboflow site settings) from the environment so it is never
# written into the notebook itself.
api_key = os.getenv("ROBOFLOW_PRIVATE_API_KEY")

if not api_key:
    # No key in the environment: fall back to the interactive login flow so that
    # "Run All" only prompts when it really has to.
    # NOTE(review): Roboflow(None) is expected to pick up the credentials stored by
    # login() — confirm against the installed roboflow version.
    roboflow.login()

# An empty string is normalized to None so a blank variable behaves like a missing one.
rf = roboflow.Roboflow(api_key or None)

# replace with your workspace and project name (you can find this in the project url on roboflow)
project = rf.workspace("public-vtwkd").project("pokemmo-tx0hm")  # must be all lowercase apparently

# if dataset version > 1, replace with the corresponding version
dataset = project.version(2).download(model_format="yolov8", location="dataset")  # location is the path where the dataset will be saved
# WARN: you need to check the paths in the data.yaml file after it is downloaded

In [None]:
# Fine-tune a pre-trained YOLOv8 checkpoint on the downloaded dataset.
# Available pre-trained models: https://docs.ultralytics.com/models/yolov8/#performance-metrics
model = YOLO("yolov8s.pt")  # pre-trained weights used as the starting point

# Run the training; device="cpu" is the setting used for intel/windows
results = model.train(
    data="dataset/data.yaml",
    epochs=20,
    imgsz=640,
    device="cpu",
)

# Inferência

In [None]:
# Select the best version of the fine-tuned model (the best.pt weights file of the training run).
# NOTE(review): the path assumes the run directory is named "train"; repeated training runs
# may be saved under a different directory name — confirm the path before loading.
model = YOLO("runs/detect/train/weights/best.pt")

In [None]:
# Predict on new images: run the fine-tuned model on every image in input_path,
# save the annotated image to output_path and print each detection.
confidence_level = 0.1  # detections below this confidence are not reported
input_path = 'captured_images'
output_path = 'detections'
image_extensions = (".png", ".jpg", ".jpeg")
class_names = model.names

# Make sure the output folder exists before anything is saved into it.
# NOTE(review): result.save() may not create missing folders itself — confirm.
os.makedirs(output_path, exist_ok=True)

# sorted() keeps the processing order (and the printed report) reproducible
for file in sorted(os.listdir(input_path)):
    if not file.lower().endswith(image_extensions):
        continue

    image_path = os.path.join(input_path, file)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be read or decoded;
        # skip it rather than handing None to the model.
        print(f"Skipping unreadable image: {image_path}")
        continue

    # generate predictions above a certain confidence
    results = model.predict(source=image, conf=confidence_level)

    output_filepath = os.path.join(output_path, f"prediction_{file}")

    for result in results:
        result.save(filename=output_filepath)  # annotated image
        print("==== Prediction Results ====")
        print("Image: " + image_path)
        boxes = result.boxes.xyxy.cpu().numpy()  # Bounding box coordinates (x_min, y_min, x_max, y_max)
        scores = result.boxes.conf.cpu().numpy()  # Confidence score
        labels = result.boxes.cls.cpu().numpy()  # Class index

        for i in range(len(boxes)):
            # Convert to a plain int so the id is printed as "0" rather than "0.0"
            # and is a regular key for the class-name lookup.
            class_id = int(labels[i])
            class_label = class_names[class_id] if class_id in class_names else "Unknown"

            print(f"--- Object {i+1} ---")
            print(f"Class: {class_label} (ID: {class_id})")
            print(f"Bounding Box Coordinates: {boxes[i]}")
            print(f"Confidence: {scores[i]:.4f}")
            print("-------------------")

        print("\n")


# Inferência em tempo real

In [None]:
import os
import mss
import cv2
import numpy as np
import time

# On Linux, access to the X display may first have to be allowed with `xhost +`.
# DISPLAY is set before pyautogui is imported (original note: this avoids a display error on Linux).
# NOTE(review): this overwrites any DISPLAY value already present in the environment —
# confirm that ':0' is always the display that should be used.
os.environ['DISPLAY'] = ':0'  # to avoid display error on Linux
import pyautogui

In [None]:
# Load the fine-tuned model (best.pt weights file of the training run)
model = YOLO("runs/detect/train/weights/best.pt")

# Screen size as reported by pyautogui; the real-time loop uses it to convert
# positions in the captured image into mouse coordinates.
screen_width, screen_height = pyautogui.size()

# Function to capture the screen and return the image
def capture_screen(save_dir='captured_images/pyautogui'):
    """Grab the main monitor and return it as a BGR image.

    Args:
        save_dir: Folder where the capture is also written as a JPEG named
            ``capture_<timestamp>.jpg``. The folder is created when missing.
            Pass ``None`` (or an empty string) to skip saving.

    Returns:
        Tuple ``(img, timestamp, img_width, img_height)``: the BGR image array,
        the timestamp string used in the file name, and the image size in pixels.
    """
    # time.strftime has no %f directive, so the microseconds must come from datetime.
    from datetime import datetime

    with mss.mss() as sct:
        screenshot = sct.grab(sct.monitors[1])  # Capture from the main monitor
        img = np.array(screenshot)  # Convert to image

    img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)  # Convert BGRA to BGR
    img_height, img_width = img.shape[:2]

    # Microsecond resolution keeps the file names of captures taken in the same second distinct
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S-%f")

    # Optionally, save the captured image
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        img_path = os.path.join(save_dir, f"capture_{timestamp}.jpg")
        # cv2.imwrite reports failure through its return value instead of raising
        if not cv2.imwrite(img_path, img):
            print(f"Warning: could not save capture to {img_path}")

    return img, timestamp, img_width, img_height

# Real-time loop: capture the screen, run the model on it and click on the center of
# every detected object. Runs until the kernel is interrupted.
detections_dir = 'detections'
os.makedirs(detections_dir, exist_ok=True)  # the annotated frames are written here

try:
    while True:
        img, timestamp, img_width, img_height = capture_screen()

        #results = model.predict(source=img, save=True, save_txt=True, conf=0.1)
        results = model(img)

        # Extract detections (bounding boxes)
        detections = results[0].boxes.xyxy  # Bounding boxes (x1, y1, x2, y2)

        if len(detections) > 0:
            print(f"Detected {len(detections)} objects.")

            # Optionally, save image with detections
            annotated_frame = results[0].plot()  # Draw bounding boxes on the image
            result_path = os.path.join(detections_dir, f"result_{timestamp}.jpg")
            # cv2.imwrite reports failure through its return value instead of raising
            if not cv2.imwrite(result_path, annotated_frame):
                print(f"Warning: could not save annotated frame to {result_path}")

            for i, (x1, y1, x2, y2) in enumerate(detections.tolist()):
                # Calculate the center of the object (in image pixels)
                center_x = int((x1 + x2) / 2)
                center_y = int((y1 + y2) / 2)

                # Convert image coordinates to screen coordinates; the captured image
                # and the size reported by pyautogui can differ
                scaled_x = int((center_x / img_width) * screen_width)
                scaled_y = int((center_y / img_height) * screen_height)

                # Move the mouse
                pyautogui.moveTo(scaled_x, scaled_y, duration=0.3)
                pyautogui.click()
                print(f"Moved to object {i+1} at ({scaled_x}, {scaled_y})")

                # Brief pause between objects
                time.sleep(0.25)  # Adjust delay as needed

        # Wait before taking the next capture
        time.sleep(3)
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to end the loop; exit without a traceback
    print("Real-time inference stopped.")
